From 94c3c82999fc734abc57081631709b5fb8acf81b Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 29 Apr 2024 15:17:17 +0100
Subject: [PATCH 01/76] Fix new clang -std=c99 -pedantic warnings about func()

---
 bgzip.c                          |  2 +-
 header.c                         |  2 +-
 hfile.c                          |  4 ++--
 hfile_libcurl.c                  |  2 +-
 hfile_s3_write.c                 |  2 +-
 hts.c                            |  6 +++---
 sam.c                            |  4 ++--
 test/plugins-dlhts.c             |  2 +-
 test/sam.c                       | 18 +++++++++---------
 test/test-bcf_set_variant_type.c |  2 +-
 test/test-vcf-api.c              |  2 +-
 test/test_kstring.c              |  4 ++--
 textutils.c                      |  2 +-
 vcf.c                            |  2 +-
 14 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/bgzip.c b/bgzip.c
index 129343fb5..d795c80a9 100644
--- a/bgzip.c
+++ b/bgzip.c
@@ -57,7 +57,7 @@ static void error(const char *format, ...)
     exit(EXIT_FAILURE);
 }
 
-static int ask_yn()
+static int ask_yn(void)
 {
     char line[1024];
     if (fgets(line, sizeof line, stdin) == NULL)
diff --git a/header.c b/header.c
index 5161034f4..7f62074f0 100644
--- a/header.c
+++ b/header.c
@@ -2358,7 +2358,7 @@ void sam_hdr_incr_ref(sam_hdr_t *bh) {
  * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
  *         NULL on failure
  */
-sam_hrecs_t *sam_hrecs_new() {
+sam_hrecs_t *sam_hrecs_new(void) {
     sam_hrecs_t *hrecs = calloc(1, sizeof(*hrecs));
 
     if (!hrecs)
diff --git a/hfile.c b/hfile.c
index fc87049ca..1241dcccb 100644
--- a/hfile.c
+++ b/hfile.c
@@ -976,7 +976,7 @@ void hfile_shutdown(int do_close_plugin)
     pthread_mutex_unlock(&plugins_lock);
 }
 
-static void hfile_exit()
+static void hfile_exit(void)
 {
     hfile_shutdown(0);
     pthread_mutex_destroy(&plugins_lock);
@@ -1082,7 +1082,7 @@ static int init_add_plugin(void *obj, int (*init)(struct hFILE_plugin *),
  * Returns 0 on success,
  *        <0 on failure
  */
-static int load_hfile_plugins()
+static int load_hfile_plugins(void)
 {
     static const struct hFILE_scheme_handler
         data = { hopen_mem, hfile_always_local, "built-in", 80 },
diff --git a/hfile_libcurl.c b/hfile_libcurl.c
index e70550eab..3463acf43 100644
--- a/hfile_libcurl.c
+++ b/hfile_libcurl.c
@@ -277,7 +277,7 @@ static void free_auth(auth_token *tok) {
     free(tok);
 }
 
-static void libcurl_exit()
+static void libcurl_exit(void)
 {
     if (curl_share_cleanup(curl.share) == CURLSHE_OK)
         curl.share = NULL;
diff --git a/hfile_s3_write.c b/hfile_s3_write.c
index d54945839..a501645ca 100644
--- a/hfile_s3_write.c
+++ b/hfile_s3_write.c
@@ -822,7 +822,7 @@ static hFILE *vhopen_s3_write(const char *url, const char *mode, va_list args) {
 }
 
 
-static void s3_write_exit() {
+static void s3_write_exit(void) {
     if (curl_share_cleanup(curl.share) == CURLSHE_OK)
         curl.share = NULL;
 
diff --git a/hts.c b/hts.c
index cf0a07d9f..06b1b3fb9 100644
--- a/hts.c
+++ b/hts.c
@@ -81,7 +81,7 @@ KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
 HTSLIB_EXPORT
 int hts_verbose = HTS_LOG_WARNING;
 
-const char *hts_version()
+const char *hts_version(void)
 {
     return HTS_VERSION_TEXT;
 }
@@ -5050,7 +5050,7 @@ int hts_resize_array_(size_t item_size, size_t num, size_t size_sz,
     return 0;
 }
 
-void hts_lib_shutdown()
+void hts_lib_shutdown(void)
 {
     hfile_shutdown(1);
 }
@@ -5064,7 +5064,7 @@ void hts_set_log_level(enum htsLogLevel level)
     hts_verbose = level;
 }
 
-enum htsLogLevel hts_get_log_level()
+enum htsLogLevel hts_get_log_level(void)
 {
     return hts_verbose;
 }
diff --git a/sam.c b/sam.c
index 1a5519410..42bcbb5a7 100644
--- a/sam.c
+++ b/sam.c
@@ -104,7 +104,7 @@ const int8_t bam_cigar_table[256] = {
     -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
 };
 
-sam_hdr_t *sam_hdr_init()
+sam_hdr_t *sam_hdr_init(void)
 {
     sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
     if (bh == NULL) return NULL;
@@ -421,7 +421,7 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
  *** BAM alignment I/O ***
  *************************/
 
-bam1_t *bam_init1()
+bam1_t *bam_init1(void)
 {
     return (bam1_t*)calloc(1, sizeof(bam1_t));
 }
diff --git a/test/plugins-dlhts.c b/test/plugins-dlhts.c
index f90e3bd74..33f432fbd 100644
--- a/test/plugins-dlhts.c
+++ b/test/plugins-dlhts.c
@@ -177,7 +177,7 @@ int main(int argc, char **argv)
 
 #else
 
-int main()
+int main(void)
 {
     printf("Tests skipped due to " SKIP "\n");
     return EXIT_SUCCESS;
diff --git a/test/sam.c b/test/sam.c
index f0eadbefe..09e4aecf5 100644
--- a/test/sam.c
+++ b/test/sam.c
@@ -1997,7 +1997,7 @@ static void test_mempolicy(void)
     }
 }
 
-static void test_bam_set1_minimal()
+static void test_bam_set1_minimal(void)
 {
     int r;
     bam1_t *bam = NULL;
@@ -2028,7 +2028,7 @@ static void test_bam_set1_minimal()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_full()
+static void test_bam_set1_full(void)
 {
     const char *qname = "!??AAA~~~~";
     const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH };
@@ -2075,7 +2075,7 @@ static void test_bam_set1_full()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_even_and_odd_seq_len()
+static void test_bam_set1_even_and_odd_seq_len(void)
 {
     const char *seq_even = "TGGACTACGA";
     const char *seq_odd  = "TGGACTACGAC";
@@ -2105,7 +2105,7 @@ static void test_bam_set1_even_and_odd_seq_len()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_with_seq_but_no_qual()
+static void test_bam_set1_with_seq_but_no_qual(void)
 {
     const char *seq = "TGGACTACGA";
 
@@ -2129,7 +2129,7 @@ static void test_bam_set1_with_seq_but_no_qual()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_validate_qname()
+static void test_bam_set1_validate_qname(void)
 {
     int r;
     bam1_t *bam = NULL;
@@ -2146,7 +2146,7 @@ static void test_bam_set1_validate_qname()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_validate_seq()
+static void test_bam_set1_validate_seq(void)
 {
     int r;
     bam1_t *bam = NULL;
@@ -2163,7 +2163,7 @@ static void test_bam_set1_validate_seq()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_validate_cigar()
+static void test_bam_set1_validate_cigar(void)
 {
     const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH };
     const char *seq = "TGGACTACGA";
@@ -2192,7 +2192,7 @@ static void test_bam_set1_validate_cigar()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_validate_size_limits()
+static void test_bam_set1_validate_size_limits(void)
 {
     const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH };
     const char *seq = "TGGACTACGA";
@@ -2224,7 +2224,7 @@ static void test_bam_set1_validate_size_limits()
     if (bam != NULL) bam_destroy1(bam);
 }
 
-static void test_bam_set1_write_and_read_back()
+static void test_bam_set1_write_and_read_back(void)
 {
     const char *qname = "q1";
     const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH };
diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c
index e5092084e..3688609f6 100644
--- a/test/test-bcf_set_variant_type.c
+++ b/test/test-bcf_set_variant_type.c
@@ -39,7 +39,7 @@ void error(const char *format, ...)
     exit(-1);
 }
 
-static void test_bcf_set_variant_type()
+static void test_bcf_set_variant_type(void)
 {
     // Test SNVs
     bcf_variant_t var1;
diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c
index eff653686..ff16fa194 100644
--- a/test/test-vcf-api.c
+++ b/test/test-vcf-api.c
@@ -625,7 +625,7 @@ void test_invalid_end_tag(void)
     hts_set_log_level(logging);
 }
 
-void test_open_format() {
+void test_open_format(void) {
     char mode[5];
     int ret;
     strcpy(mode, "r");
diff --git a/test/test_kstring.c b/test/test_kstring.c
index ee913a2e3..feb8243df 100644
--- a/test/test_kstring.c
+++ b/test/test_kstring.c
@@ -290,7 +290,7 @@ static char *mock_fgets(char *str, int num, void *p) {
     return str;
 }
 
-static int test_kgetline() {
+static int test_kgetline(void) {
     kstring_t s = KS_INITIALIZE;
     int mock_state = 0;
 
@@ -346,7 +346,7 @@ static ssize_t mock_fgets2(char *str, size_t num, void *p) {
     return strlen(str);
 }
 
-static int test_kgetline2() {
+static int test_kgetline2(void) {
     kstring_t s = KS_INITIALIZE;
     int mock_state = 0;
 
diff --git a/textutils.c b/textutils.c
index 0cc2af818..b2c29a893 100644
--- a/textutils.c
+++ b/textutils.c
@@ -220,7 +220,7 @@ static char token_type(hts_json_token *token)
 }
 
 HTSLIB_EXPORT
-hts_json_token * hts_json_alloc_token() {
+hts_json_token * hts_json_alloc_token(void) {
     return calloc(1, sizeof(hts_json_token));
 }
 
diff --git a/vcf.c b/vcf.c
index 9dec8481b..e8e4fce7a 100644
--- a/vcf.c
+++ b/vcf.c
@@ -1567,7 +1567,7 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h)
  *** BCF site I/O ***
  ********************/
 
-bcf1_t *bcf_init()
+bcf1_t *bcf_init(void)
 {
     bcf1_t *v;
     v = (bcf1_t*)calloc(1, sizeof(bcf1_t));

From 850dc64ea2bbe979c0edd39f59799db0c4a80759 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 29 Apr 2024 15:21:15 +0100
Subject: [PATCH 02/76] Move address sanitizer from clang-latest to gcc-latest.

The latest Ubuntu release appears to ship a broken clang that is missing
address sanitizer support.  We don't particularly care which compiler we
test it with, so this side-steps the CI issue.
---
 .cirrus.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index fc4405b08..5d2e3b6a7 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -73,7 +73,8 @@ gcc_task:
        USE_CONFIG: no
     - environment:
        USE_CONFIG: yes
-       CFLAGS: -std=c99 -pedantic -Wformat=2
+       CFLAGS: -std=c99 -pedantic -Wformat=2 -fsanitize=address
+       LDFLAGS: -fsanitize=address
        USE_LIBDEFLATE: yes
 
   install_script: |
@@ -108,8 +109,7 @@ ubuntu_task:
        DO_UNTRACKED_FILE_CHECK: yes
     - environment:
        USE_CONFIG: yes
-       CFLAGS: -g -Wall -O3 -fsanitize=address
-       LDFLAGS: -fsanitize=address
+       CFLAGS: -g -Wall -O3
        USE_LIBDEFLATE: yes
 
   # NB: we could consider building a docker image with these

From c93f5a57e63bc594a291b145407f1d8fcbef59bd Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 30 Apr 2024 09:25:33 +0100
Subject: [PATCH 03/76] Update htscodecs version to fix compiler void pedantry.

---
 htscodecs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htscodecs b/htscodecs
index ffda7310c..3865c8820 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit ffda7310c4b3292955561d6c3b1743cb82bfe26b
+Subproject commit 3865c88208d8e667bcc4d2bfd49541074b797d03

From 1e7efc0b9fb2472453dc22ccf30f57a6818d8585 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 22 Apr 2024 15:44:21 +0100
Subject: [PATCH 04/76] Fix fuzz-detected integer overflow in cram encoder.

Input files with very long CIGAR strings and a consensus-generated
embedded reference can lead to exceptionally large CRAM blocks, whose
sizes overflow the arithmetic in the check for large size fluctuations
(used to trigger new compression metric assessments).

Reformulated the expression to avoid scaling up values.

Credit to OSS-Fuzz
Fixes oss-fuzz 68225
---
 cram/cram_io.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cram/cram_io.c b/cram/cram_io.c
index 247423354..7f7ffca49 100644
--- a/cram/cram_io.c
+++ b/cram/cram_io.c
@@ -1984,11 +1984,15 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s,
         // We also get large fluctuations based on genome coordinate for
         // e.g. SA:Z and SC series, but we consider the typical scale of
         // delta between blocks and use this to look for abnormality.
+
+        // Equivalent to (but minus possible integer overflow)
+        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
+        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
         if (metrics->input_avg_sz &&
-            (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) ||
-             b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) &&
-            ABS(b->uncomp_size-metrics->input_avg_sz)
-                > 10*metrics->input_avg_delta) {
+            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
+             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
+            ABS(b->uncomp_size-metrics->input_avg_sz)/10
+                > metrics->input_avg_delta) {
             metrics->next_trial = 0;
         }
 

From 9a99a1d574a0438d7f4e8a81e60b315f653f4b68 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Sun, 21 Apr 2024 15:27:41 +1200
Subject: [PATCH 05/76] Check interval start to avoid overflowing bin numbers

Check start positions of query intervals against the maximum position
representable in the index's geometry, to avoid negative bin numbers
and the resulting infinite loops in the do...while loop.

Introduce hts_bin_maxpos() and hts_idx_maxpos(), and use them wherever the
maxpos calculation appears. (Leave the latter private, at least for now.)

Also change the existing end checks to <= as end is exclusive -- note it
is used as end-1 in the code guarded by the checks.
---
 hts.c        | 21 +++++++++++++++++----
 htslib/hts.h |  7 +++++++
 tbx.c        |  2 +-
 vcf.c        |  2 +-
 4 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/hts.c b/hts.c
index 06b1b3fb9..1021cf748 100644
--- a/hts.c
+++ b/hts.c
@@ -2446,9 +2446,14 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
     return ret;
 }
 
+static inline hts_pos_t hts_idx_maxpos(const hts_idx_t *idx)
+{
+    return hts_bin_maxpos(idx->min_shift, idx->n_lvls);
+}
+
 int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
 {
-    int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3);
+    hts_pos_t maxpos = hts_idx_maxpos(idx);
     if (tid < 0 || (beg <= maxpos && end <= maxpos))
         return 0;
 
@@ -3341,6 +3346,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
     khint_t k;
     bidx_t *bidx;
     uint64_t min_off, max_off;
+    hts_pos_t idx_maxpos;
     hts_itr_t *iter;
     uint32_t unmapped = 0, rel_off;
 
@@ -3385,6 +3391,9 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
 
             if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
 
+            idx_maxpos = hts_idx_maxpos(idx);
+            if (beg >= idx_maxpos) { iter->finished = 1; return iter; }
+
             rel_off = beg>>idx->min_shift;
             // compute min_off
             bin = hts_bin_first(idx->n_lvls) + rel_off;
@@ -3427,7 +3436,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
             // compute max_off: a virtual offset from a bin to the right of end
             // First check if end lies within the range of the index (it won't
             // if it's HTS_POS_MAX)
-            if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
+            if (end <= idx_maxpos) {
                 bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
                 if (bin >= idx->n_bins) bin = 0;
                 while (1) {
@@ -3513,7 +3522,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
     bidx_t *bidx;
     uint64_t min_off, max_off, t_off = (uint64_t)-1;
     int tid;
-    hts_pos_t beg, end;
+    hts_pos_t beg, end, idx_maxpos;
     hts_reglist_t *curr_reg;
     uint32_t unmapped = 0, rel_off;
 
@@ -3555,6 +3564,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
             else
                 unmapped = 1;
 
+            idx_maxpos = hts_idx_maxpos(idx);
+
             for(j=0; j<curr_reg->count; j++) {
                 hts_pair32_t *curr_intv = &curr_reg->intervals[j];
                 if (curr_intv->end < curr_intv->beg)
@@ -3562,6 +3573,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
 
                 beg = curr_intv->beg;
                 end = curr_intv->end;
+                if (beg >= idx_maxpos)
+                    continue;
                 rel_off = beg>>idx->min_shift;
 
                 /* Compute 'min_off' by searching the lowest level bin containing 'beg'.
@@ -3606,7 +3619,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter)
                 // compute max_off: a virtual offset from a bin to the right of end
                 // First check if end lies within the range of the index (it
                 // won't if it's HTS_POS_MAX)
-                if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) {
+                if (end <= idx_maxpos) {
                     bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1;
                     if (bin >= idx->n_bins) bin = 0;
                     while (1) {
diff --git a/htslib/hts.h b/htslib/hts.h
index 4c54be056..870ea4532 100644
--- a/htslib/hts.h
+++ b/htslib/hts.h
@@ -1534,6 +1534,13 @@ static inline int hts_bin_bot(int bin, int n_lvls)
     return (bin - hts_bin_first(l)) << (n_lvls - l) * 3;
 }
 
+/// Compute the (0-based exclusive) maximum position covered by a binning index
+static inline hts_pos_t hts_bin_maxpos(int min_shift, int n_lvls)
+{
+    hts_pos_t one = 1;
+    return one << (min_shift + n_lvls * 3);
+}
+
 /**************
  * Endianness *
  **************/
diff --git a/tbx.c b/tbx.c
index 5f861299a..ade2e9f09 100644
--- a/tbx.c
+++ b/tbx.c
@@ -321,7 +321,7 @@ static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len)
 // files with very large contigs.
 static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len)
 {
-    int64_t s = 1LL << (min_shift + n_lvls * 3);
+    int64_t s = hts_bin_maxpos(min_shift, n_lvls);
     max_len += 256;
     for (; max_len > s; ++n_lvls, s <<= 3) {}
     return n_lvls;
diff --git a/vcf.c b/vcf.c
index e8e4fce7a..53f2b7a92 100644
--- a/vcf.c
+++ b/vcf.c
@@ -4288,7 +4288,7 @@ static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift,
     }
     if ( !max_len ) max_len = (1LL<<31) - 1;  // In case contig line is broken.
     max_len += 256;
-    s = 1LL << (min_shift + starting_n_lvls * 3);
+    s = hts_bin_maxpos(min_shift, starting_n_lvls);
     for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3);
 
     if (nids_out) *nids_out = nids;

From ab7c09f0a972a6fbaf6085dfa83203096f4d7d4d Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 29 Feb 2024 09:52:30 +0000
Subject: [PATCH 06/76] Fix cram_index_query to iterate correctly for ref
 HTS_IDX_NOCOOR.

cram_index_query_last does a loop on cram_index_query with the
previous index entry in "from".  This scans to find the last
container.  If we're doing a query of ref "*" however it comes in as
reference HTS_IDX_NOCOOR (-2) and fails the refid matching check.

This makes cram_index_query_last now work again for region "*".
---
 cram/cram_index.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cram/cram_index.c b/cram/cram_index.c
index 0908736ab..639bc4c41 100644
--- a/cram/cram_index.c
+++ b/cram/cram_index.c
@@ -410,6 +410,9 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
         // Continue from a previous search.
         // We switch to just scanning the linked list, as the nested
         // lists are typically short.
+        if (refid == HTS_IDX_NOCOOR)
+            refid = -1;
+
         e = from->e_next;
         if (e && e->refid == refid && e->start <= pos)
             return e;
@@ -423,6 +426,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
         // fail, or already there, dealt with elsewhere.
         return NULL;
 
+    case -1:
     case HTS_IDX_NOCOOR:
         refid = -1;
         pos = 0;

From 7576aca19938147dda7688ab685be4d7e5a0cd35 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 7 Mar 2024 15:14:55 +0000
Subject: [PATCH 07/76] Provide extra CRAM container manipulations and index
 queries.

Added to support extra functionality to `samtools cat`.

- Some internal cram functions are no longer static as they're called
  from cram_external.c, but they don't have HTSLIB_EXPORT and aren't
  an official part of the API.
  These are cram_to_bam and cram_next_slice.

- New public CRAM APIs:
  These facilitate manipulation at the container level, both seeking
  to specific byte offsets, but also being able to specify containers
  as the n^th container listed in the index.

  cram_container_get_coords returns refid, start and span fields from
  the opaque cram_container struct.

  cram_filter_container copies a container but applies region based
  filtering, as already specified in the cram_fd with a range request.
  (Note we currently also provide cram_copy_slice, but may want to add
  a cram_copy_container for consistency.)

  cram_index_extents queries an index to return byte offsets of the
  first and last container overlapping a specified region.

  cram_num_containers_between queries an index to report the number of
  indexed containers and their container numbers (starting at 0 for
  the first) covering a range.

  cram_num_containers is a simplified cram_num_containers_between
  doing only the counting operation and on the entire file.

  cram_container_num2offset returns the byte offset for the n^th
  container.  cram_container_offset2num does the reverse.

- A new cram_skip_container function, which is currently internal only
  but may potentially have use externally in the future.  It's used by
  cram_filter_container when it detects it'll filter out everything.

- cram_index_query now copes with HTS_IDX_NOCOOR (-2) and maps it
  over to refid -1.
---
 cram/cram_decode.c   |   8 +-
 cram/cram_decode.h   |  27 +++++-
 cram/cram_external.c | 200 ++++++++++++++++++++++++++++++++++++++++++-
 cram/cram_index.c    | 192 ++++++++++++++++++++++++++++++++++++++++-
 htslib/cram.h        |  73 ++++++++++++++++
 5 files changed, 493 insertions(+), 7 deletions(-)

diff --git a/cram/cram_decode.c b/cram/cram_decode.c
index 86e2ef96e..2b2ad6029 100644
--- a/cram/cram_decode.c
+++ b/cram/cram_decode.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd.
+Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -3004,8 +3004,8 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
  * Returns the used size of the bam record on success
  *         -1 on failure.
  */
-static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
-                       cram_record *cr, int rec, bam_seq_t **bam) {
+int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
+                cram_record *cr, int rec, bam_seq_t **bam) {
     int ret, rg_len;
     char name_a[1024], *name;
     int name_len;
@@ -3172,7 +3172,7 @@ static cram_container *cram_first_slice(cram_fd *fd) {
     return c;
 }
 
-static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
+cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
     cram_container *c_curr;  // container being consumed via cram_get_seq()
     cram_slice *s_curr = NULL;
 
diff --git a/cram/cram_decode.h b/cram/cram_decode.h
index 400eb6beb..16d87a073 100644
--- a/cram/cram_decode.h
+++ b/cram/cram_decode.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2012-2013, 2018 Genome Research Ltd.
+Copyright (c) 2012-2013, 2018, 2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -94,6 +94,15 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
 cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b);
 
 
+/*! INTERNAL:
+ * Loads and decodes the next slice worth of data.
+ *
+ * @return
+ * Returns cram slice pointer on success;
+ *         NULL on failure
+ */
+cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp);
+
 /*! INTERNAL:
  * Decode an entire slice from container blocks. Fills out s->crecs[] array.
  *
@@ -105,6 +114,22 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
                       sam_hdr_t *hdr);
 
 
+/*! INTERNAL:
+ * Converts a cram in-memory record into a bam in-memory record. We
+ * pass a pointer to a bam_seq_t pointer along with a pointer to
+ * the allocated size. These can initially be pointers to NULL and zero.
+ *
+ * This function will reallocate the bam buffer as required and update
+ * (*bam)->alloc accordingly, allowing it to be used within a loop
+ * efficiently without needing to allocate new bam objects over and
+ * over again.
+ *
+ * Returns the used size of the bam record on success
+ *         -1 on failure.
+ */
+int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
+                cram_record *cr, int rec, bam_seq_t **bam);
+
 /*
  * Drains and frees the decode read-queue for a multi-threaded reader.
  */
diff --git a/cram/cram_external.c b/cram/cram_external.c
index 7455185ad..1102e8daa 100644
--- a/cram/cram_external.c
+++ b/cram/cram_external.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd.
+Copyright (c) 2015, 2018-2020, 2022-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #include "../htslib/hfile.h"
+#include "../hfile_internal.h"
 #include "cram.h"
 
 /*
@@ -121,6 +122,16 @@ int cram_container_is_empty(cram_fd *fd) {
     return fd->empty_container;
 }
 
+void cram_container_get_coords(cram_container *c,
+                               int *refid, hts_pos_t *start, hts_pos_t *span) {
+    if (refid)
+        *refid = c->ref_seq_id;
+    if (start)
+        *start = c->ref_seq_start;
+    if (span)
+        *span  = c->ref_seq_span;
+}
+
 
 /*
  *-----------------------------------------------------------------------------
@@ -683,6 +694,7 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
             cram_free_block(blk);
             return -1;
         }
+
         if (cram_write_block(out, blk) != 0) {
             cram_free_block(blk);
             return -1;
@@ -704,6 +716,192 @@ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) {
     return 0;
 }
 
+/*
+ * Discards the next container's worth of data.
+ * Only the cram structure has been read so far.
+ *
+ * Returns 0 on success,
+ *        -1 on failure
+ */
+static int cram_skip_container(cram_fd *in, cram_container *c) {
+    // Compression header
+    cram_block *blk;
+    if (!(blk = cram_read_block(in)))
+        return -1;
+    cram_free_block(blk);
+
+    int i;
+    for (i = 0; i < c->num_landmarks; i++) {
+        cram_block_slice_hdr *hdr;
+
+        if (!(blk = cram_read_block(in)))
+            return -1;
+        if (!(hdr = cram_decode_slice_header(in, blk))) {
+            cram_free_block(blk);
+            return -1;
+        }
+        cram_free_block(blk);
+
+        int num_blocks = cram_slice_hdr_get_num_blocks(hdr), j;
+        for (j = 0; j < num_blocks; j++) {
+            blk = cram_read_block(in);
+            if (!blk) {
+                cram_free_slice_header(hdr);
+                return -1;
+            }
+            cram_free_block(blk);
+        }
+        cram_free_slice_header(hdr);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Copies a container, but filtering it down to a specific region,
+ * which has already been set on the 'in' fd.
+ *
+ * This is used in e.g. samtools cat where we specified a region and discover
+ * that a region doesn't entirely span the container, so we have to select
+ * which reads we need to copy out of it.
+ *
+ * If ref_id is non-NULL we also return the last ref_id we filtered.
+ * This can be -2 if it's multi-ref and we observe more than one reference,
+ * and actual ref_id >= -1 if it's multi-ref and we observe just one ref or
+ * it's fixed reference.
+ *
+ * Returns 0 on success
+ *        -1 on error
+ */
+int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c,
+                          int *ref_id) {
+    int err = 0, fixed_ref = -3;
+
+    if (ref_id)
+        *ref_id = c->ref_seq_id;
+
+    int rid = in->range.refid == -2 ? -1 : in->range.refid;
+    if (rid != c->ref_seq_id ||
+        in->range.start > c->ref_seq_start + c->ref_seq_span-1)
+        // Except for multi-ref cases
+        if (c->ref_seq_id != -2)
+            return cram_skip_container(in, c);
+
+    // Container compression header
+    cram_block *blk = cram_read_block(in);
+    if (!blk)
+        return -1;
+    c->comp_hdr = cram_decode_compression_header(in, blk);
+    in->ctr = c;
+
+    // If it's multi-ref but a constant ref-id, then we can still do
+    // basic level chromosome filtering.  Similarly multi-ref where we're
+    // _already_ in ref "*" (unmapped) means we can just copy the container
+    // as there are no positions to filter on and "*" sorts to the end.
+    // TODO: how to tell "already in" though?
+    if (c->ref_seq_id == -2) {
+        cram_codec *cd = c->comp_hdr->codecs[DS_RI];
+        if (cd && cd->codec == E_HUFFMAN && cd->u.huffman.ncodes == 1 &&
+            // this check should be always true anyway
+            rid == cd->u.huffman.codes[0].symbol)
+            // We're in multi-ref mode, but actually the entire container
+            // matches.  So if we're in whole-chromosome mode we can just
+            // copy.
+            if (in->range.start <= 1 &&
+                in->range.end >= (INT64_MAX&(0xffffffffULL<<32))) {
+                if (ref_id)
+                    *ref_id = rid;
+                err |= cram_write_container(out, c) < 0;
+                err |= cram_write_block(out, blk);
+                return cram_copy_slice(in, out, c->num_landmarks) | -err;
+            }
+    }
+
+    // A simple read-write loop with region filtering automatically due to
+    // an earlier CRAM_OPT_RANGE request.
+    //
+    // We can hit EOF when reaching the end of the range, but we still need
+    // to manually check we don't attempt to read beyond this single container.
+
+    cram_range rng_copy = in->range;
+    in->range.start = INT64_MIN;
+    in->range.end = INT64_MAX;
+
+    bam1_t *b = bam_init1();
+    while ((c->curr_slice < c->max_slice ||
+            c->slice->curr_rec < c->slice->max_rec)) {
+        cram_slice *s;
+        if (c->slice && c->slice->curr_rec < c->slice->max_rec)
+            s = c->slice;
+        else if (c->curr_slice < c->max_slice)
+            s = cram_next_slice(in, &c);
+        else
+            break; // end of container
+        c->slice = s;
+
+        // This is more efficient if we check as a cram record instead of a
+        // bam record as we don't have to parse CIGAR end.
+        cram_record *cr = &c->slice->crecs[c->slice->curr_rec];
+        if (fixed_ref == -3)
+            fixed_ref = cr->ref_id;
+        else if (fixed_ref != cr->ref_id)
+            fixed_ref = -2;
+
+        if (rng_copy.refid != cr->ref_id) {
+            if (rng_copy.refid == -2) {
+                if (cr->ref_id > -1) {
+                    // Want unmapped, but have mapped
+                    c->slice->curr_rec++;
+                    continue;
+                }
+            } else {
+                if (rng_copy.refid > cr->ref_id || rng_copy.refid == -1) {
+                    // multi-ref and not at the correct ref yet
+                    c->slice->curr_rec++;
+                    continue;
+                } else {
+                    // multi-ref and beyond the desired ref
+                    break;
+                }
+            }
+        }
+
+        // Correct ref, but check the desired region
+        if (cr->aend < rng_copy.start) {
+            c->slice->curr_rec++;
+            continue;
+        }
+        if (cr->apos > rng_copy.end)
+            break;
+
+        // Broadly equivalent to cram_get_bam_seq, but starting from 'cr'
+        err |= cram_to_bam(in->header, in, s, cr, s->curr_rec++, &b) < 0;
+
+        if (cram_put_bam_seq(out, b) < 0) {
+            err |= 1;
+            break;
+        }
+    }
+    bam_destroy1(b);
+
+    if (ref_id)
+        *ref_id = fixed_ref;
+
+    in->range = rng_copy;
+
+    // Avoids double frees as we stole the container from our other
+    // file descriptor.
+    in->ctr    = NULL;
+    in->ctr_mt = NULL;
+
+    err |= cram_flush(out);
+    cram_free_block(blk);
+
+    return -err;
+}
+
+
 /*
  * Renumbers RG numbers in a cram compression header.
  *
diff --git a/cram/cram_index.c b/cram/cram_index.c
index 639bc4c41..77c953d6c 100644
--- a/cram/cram_index.c
+++ b/cram/cram_index.c
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2013-2020, 2023 Genome Research Ltd.
+Copyright (c) 2013-2020, 2023-2024 Genome Research Ltd.
 Author: James Bonfield <jkb@sanger.ac.uk>
 
 Redistribution and use in source and binary forms, with or without
@@ -848,3 +848,193 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
 
     return (bgzf_close(fp) >= 0)? 0 : -4;
 }
+
+// internal recursive step
+static int64_t cram_num_containers_between_(cram_index *e, int64_t *last_pos,
+                                            int64_t nct,
+                                            off_t cstart, off_t cend,
+                                            int64_t *first, int64_t *last) {
+    int64_t nc = 0, i;
+
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (e->offset >= cstart && (!cend || e->offset <= cend)) {
+                if (first && *first < 0)
+                    *first = nct;
+                if (last)
+                    *last = nct;
+            }
+            nc++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    for (i = 0; i < e->nslice; i++)
+        nc += cram_num_containers_between_(&e->e[i], last_pos, nc + nct,
+                                           cstart, cend, first, last);
+
+    return nc;
+}
+
+/*! Returns the number of containers in the CRAM file within given offsets.
+ *
+ * The cstart and cend offsets are the locations of the start of containers
+ * as returned by index_container_offset.  A cend of 0 means no upper limit.
+ *
+ * If non-NULL, first and last will hold the inclusive range of container
+ * numbers, counting from zero, or -1 if no indexed container is in range.
+ *
+ * @return
+ * Returns the number of containers (*last-*first+1), or 0 if none match.
+ */
+int64_t cram_num_containers_between(cram_fd *fd,
+                                    off_t cstart, off_t cend,
+                                    int64_t *first, int64_t *last) {
+    int64_t nc = 0, i;
+    int64_t last_pos = -99;
+    int64_t l_first = -1, l_last = -1;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        nc += cram_num_containers_between_(&fd->index[j], &last_pos, nc,
+                                           cstart, cend, &l_first, &l_last);
+    }
+
+    if (first)
+        *first = l_first;
+    if (last)
+        *last = l_last;
+    // With no match both stay at -1; do not report that as one container.
+    return l_first < 0 ? 0 : l_last - l_first + 1;
+}
+
+/*
+ * Queries the total number of distinct containers in the index.
+ * Note there may be more containers in the file than in the index, as we
+ * are not required to have an index entry for every one.
+ */
+int64_t cram_num_containers(cram_fd *fd) {
+    return cram_num_containers_between(fd, 0, 0, NULL, NULL);
+}
+
+
+// Recursive step: finds the num'th distinct container, counting them in *nc.
+static cram_index *cram_container_num2offset_(cram_index *e, int64_t num,
+                                              int64_t *last_pos, int64_t *nc) {
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (*nc == num)
+                return e;
+            (*nc)++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    int i;
+    for (i = 0; i < e->nslice; i++) {
+        cram_index *tmp = cram_container_num2offset_(&e->e[i], num,
+                                                     last_pos, nc);
+        if (tmp)
+            return tmp;
+    }
+
+    return NULL;
+}
+
+/*! Returns the byte offset for the start of the n^th container.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+off_t cram_container_num2offset(cram_fd *fd, int64_t num) {
+    int i;
+    int64_t nc = 0, last_pos = -9; // 64-bit count so num is never narrowed
+    cram_index *e = NULL;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        if (!fd->index[j].nslice)
+            continue;
+        if ((e = cram_container_num2offset_(&fd->index[j], num,
+                                            &last_pos, &nc)))
+            break;
+    }
+
+    return e ? e->offset : -1;
+}
+
+
+// Recursive step: finds the first container at offset >= pos, counting in *nc.
+static cram_index *cram_container_offset2num_(cram_index *e, off_t pos,
+                                              int64_t *last_pos, int64_t *nc) {
+    if (e->offset) {
+        if (e->offset != *last_pos) {
+            if (e->offset >= pos)
+                return e;
+            (*nc)++;
+        }
+        // else a new multi-ref in same container
+        *last_pos = e->offset;
+    }
+
+    int i;
+    for (i = 0; i < e->nslice; i++) {
+        cram_index *tmp = cram_container_offset2num_(&e->e[i], pos,
+                                                     last_pos, nc);
+        if (tmp)
+            return tmp;
+    }
+
+    return NULL;
+}
+
+/*! Returns the container number for the first container at offset >= pos.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+int64_t cram_container_offset2num(cram_fd *fd, off_t pos) {
+    int i;
+    int64_t nc = 0, last_pos = -9; // counter matches the int64_t return type
+    cram_index *e = NULL;
+
+    for (i = 0; i < fd->index_sz; i++) {
+        int j = i+1 == fd->index_sz ? 0 : i+1; // maps "*" to end
+        if (!fd->index[j].nslice)
+            continue;
+        if ((e = cram_container_offset2num_(&fd->index[j], pos,
+                                            &last_pos, &nc)))
+            break;
+    }
+
+    return e ? nc : -1;
+}
+
+/*!
+ * Returns the file offsets of CRAM containers covering a specific region
+ * query.  Note both offsets are the START of the container.
+ *
+ * first will point to the start of the first overlapping container
+ * last will point to the start of the last overlapping container
+ *
+ * Returns 0 on success
+ *        <0 on failure
+ */
+int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
+                       off_t *first, off_t *last) {
+    cram_index *ci;
+
+    if (first) {
+        if (!(ci = cram_index_query(fd, refid, start, NULL)))
+            return -1;
+        *first = ci->offset;
+    }
+
+    if (last) {
+        if (!(ci = cram_index_query_last(fd, refid, end)))
+            return -1;
+        *last = ci->offset;
+    }
+
+    return 0;
+}
diff --git a/htslib/cram.h b/htslib/cram.h
index e0b51839c..841e4a9b6 100644
--- a/htslib/cram.h
+++ b/htslib/cram.h
@@ -209,6 +209,11 @@ HTSLIB_EXPORT
 int cram_container_is_empty(cram_fd *fd);
 
 
+/* Returns chromosome and start/span from container struct */
+HTSLIB_EXPORT
+void cram_container_get_coords(cram_container *c,
+                               int *refid, hts_pos_t *start, hts_pos_t *span);
+
 /*
  *-----------------------------------------------------------------------------
  * cram_block
@@ -329,6 +334,18 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out,
 HTSLIB_EXPORT
 int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice);
 
+/*
+ * Copies a container, but filtering it down to a specific region (as
+ * already specified in 'in').
+ *
+ * Returns 0 on success
+ *        -1 on EOF
+ *        -2 on error
+ */
+HTSLIB_EXPORT
+int cram_filter_container(cram_fd *in, cram_fd *out, cram_container *c,
+                          int *ref_id);
+
 /*
  * Decodes a CRAM block compression header.
  * Returns header ptr on success
@@ -744,6 +761,62 @@ static inline void sam_hdr_free(SAM_hdr *hdr) { sam_hdr_destroy(hdr); }
 HTSLIB_EXPORT
 refs_t *cram_get_refs(htsFile *fd);
 
+/*!
+ * Returns the file offsets of CRAM containers covering a specific region
+ * query.  Note both offsets are the START of the container.
+ *
+ * first will point to the start of the first overlapping container
+ * last will point to the start of the last overlapping container
+ *
+ * @return
+ * Returns 0 on success
+ *        <0 on failure
+ */
+HTSLIB_EXPORT
+int cram_index_extents(cram_fd *fd, int refid, hts_pos_t start, hts_pos_t end,
+                       off_t *first, off_t *last);
+
+/*! Returns the total number of containers in the CRAM index.
+ *
+ * Note the index is not required to have an entry for every container, but it
+ * will always have an index entry for the start of each chromosome.
+ * (Although in practice our indices do contain one entry per container.)
+ *
+ * This is equivalent to cram_num_containers_between(fd, 0, 0, NULL, NULL)
+ */
+HTSLIB_EXPORT
+int64_t cram_num_containers(cram_fd *fd);
+
+/*! Returns the number of containers in the CRAM index within given offsets.
+ *
+ * The cstart and cend offsets are the locations of the start of containers
+ * as returned by index_container_offset.
+ *
+ * If non-NULL, first and last will hold the inclusive range of container
+ * numbers, counting from zero.
+ *
+ * @return
+ * Returns the number of containers, equivalent to *last-*first+1.
+ */
+HTSLIB_EXPORT
+int64_t cram_num_containers_between(cram_fd *fd,
+                                    off_t cstart, off_t cend,
+                                    int64_t *first, int64_t *last);
+
+/*! Returns the byte offset for the start of the n^th container.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+HTSLIB_EXPORT
+off_t cram_container_num2offset(cram_fd *fd, int64_t n);
+
+/*! Returns the container number for the first container at offset >= pos.
+ *
+ * The index must have previously been loaded, otherwise <0 is returned.
+ */
+HTSLIB_EXPORT
+int64_t cram_container_offset2num(cram_fd *fd, off_t pos);
+
 /**@}*/
 
 #ifdef __cplusplus

From 0f5719a38a0d0026917423181c4c9c108c431399 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Tue, 26 Sep 2023 08:43:35 +0200
Subject: [PATCH 08/76] Enable SSSE3 intrinsics for nibble parsing

---
 sam_internal.h | 74 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 8 deletions(-)

diff --git a/sam_internal.h b/sam_internal.h
index b1fce9fe4..4755ba789 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -26,7 +26,9 @@ DEALINGS IN THE SOFTWARE.  */
 #include <errno.h>
 #include <stdint.h>
 #include "htslib/sam.h"
-
+#ifdef __SSSE3__
+#include "tmmintrin.h"
+#endif
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -87,15 +89,71 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
         "B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN"
         "N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN";
 
-    int i, len2 = len/2;
     seq[0] = 0;
-
-    for (i = 0; i < len2; i++)
+    const char *seq_end_ptr = seq + len;
+    char *seq_cursor = seq;
+    const uint8_t *nibble_cursor = nib;
+    const char *seq_end_ptr_twoatatime = seq + (len & (~1ULL));
+    #ifdef __SSSE3__
+    const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
+    __m128i first_upper_shuffle = _mm_setr_epi8(
+        0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
+    __m128i first_lower_shuffle = _mm_setr_epi8(
+        0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
+    __m128i second_upper_shuffle = _mm_setr_epi8(
+        8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
+    __m128i second_lower_shuffle = _mm_setr_epi8(
+        0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
+    __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
+    /* Work on 16 encoded characters at the time resulting in 32 decoded characters
+       Examples are given for 8 encoded characters A until H to keep it readable.
+        Encoded stored as |AB|CD|EF|GH|
+        Shuffle into |AB|00|CD|00|EF|00|GH|00| and
+                     |00|AB|00|CD|00|EF|00|GH|
+        Shift upper to the right resulting into
+                     |0A|B0|0C|D0|0E|F0|0G|H0| and
+                     |00|AB|00|CD|00|EF|00|GH|
+        Merge with or resulting into (X stands for garbage)
+                     |0A|XB|0C|XD|0E|XF|0G|XH|
+        Bitwise and with 0b1111 leads to:
+                     |0A|0B|0C|0D|0E|0F|0G|0H|
+        We can use the resulting 4-bit integers as indexes for the shuffle of
+        the nucleotide lookup. */
+    while (seq_cursor < seq_vec_end_ptr) {
+        __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);
+
+        __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
+        __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
+        __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
+        __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
+        __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
+        __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
+        _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
+
+        __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
+        __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
+        __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
+        __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
+        __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
+        __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
+        _mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);
+
+        nibble_cursor += sizeof(__m128i);
+        seq_cursor += 2 * sizeof(__m128i);
+    }
+    #endif
+    while (seq_cursor < seq_end_ptr_twoatatime) {
         // Note size_t cast helps gcc optimiser.
-        memcpy(&seq[i*2], &code2base[(size_t)nib[i]*2], 2);
-
-    if ((i *= 2) < len)
-        seq[i] = seq_nt16_str[bam_seqi(nib, i)];
+        memcpy(seq_cursor, code2base + ((size_t)*nibble_cursor * 2), 2);
+        seq_cursor += 2;
+        nibble_cursor += 1;
+    }
+    if (seq_cursor != seq_end_ptr) {
+        /* There is a single encoded nuc left */
+        uint8_t nibble_c = *nibble_cursor;
+        uint8_t upper_nuc_index = nibble_c >> 4;
+        seq_cursor[0] = seq_nt16_str[upper_nuc_index];
+    }
 }
 
 #ifdef __cplusplus

From 1b1b8ae521eec08ad8acf3f2d0ec5ffd9c35eccd Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Tue, 2 Apr 2024 09:08:42 +0200
Subject: [PATCH 09/76] Make a dynamic dispatcher function for nibble2base

---
 sam_internal.h | 74 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 24 deletions(-)

diff --git a/sam_internal.h b/sam_internal.h
index 4755ba789..f901070b6 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -25,10 +25,9 @@ DEALINGS IN THE SOFTWARE.  */
 
 #include <errno.h>
 #include <stdint.h>
+#include "htslib/hts_defs.h"
 #include "htslib/sam.h"
-#ifdef __SSSE3__
-#include "tmmintrin.h"
-#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -70,7 +69,7 @@ static inline int possibly_expand_bam_data(bam1_t *b, size_t bytes) {
  * for (i = 0; i < len; i++)
  *    seq[i] = seq_nt16_str[bam_seqi(nib, i)];
  */
-static inline void nibble2base(uint8_t *nib, char *seq, int len) {
+static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
     static const char code2base[512] =
         "===A=C=M=G=R=S=V=T=W=Y=H=K=D=B=N"
         "A=AAACAMAGARASAVATAWAYAHAKADABAN"
@@ -89,21 +88,42 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
         "B=BABCBMBGBRBSBVBTBWBYBHBKBDBBBN"
         "N=NANCNMNGNRNSNVNTNWNYNHNKNDNBNN";
 
+    int i, len2 = len/2;
+    seq[0] = 0;
+
+    for (i = 0; i < len2; i++)
+        // Note size_t cast helps gcc optimiser.
+        memcpy(&seq[i*2], &code2base[(size_t)nib[i]*2], 2);
+
+    if ((i *= 2) < len)
+        seq[i] = seq_nt16_str[bam_seqi(nib, i)];
+}
+
+#if HTS_GCC_AT_LEAST(4,8)
+/*
+ * Convert a nibble encoded BAM sequence to a string of bases.
+ *
+ * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be
+ * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can
+ * be converted to the IUPAC characters.
+ * It falls back on the nibble2base_default function for the remainder.
+ */
+#include "tmmintrin.h"
+__attribute__((target("ssse3")))
+static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
     seq[0] = 0;
     const char *seq_end_ptr = seq + len;
     char *seq_cursor = seq;
-    const uint8_t *nibble_cursor = nib;
-    const char *seq_end_ptr_twoatatime = seq + (len & (~1ULL));
-    #ifdef __SSSE3__
+    uint8_t *nibble_cursor = nib;
     const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
     __m128i first_upper_shuffle = _mm_setr_epi8(
-        0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7, 0xff);
+        0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
     __m128i first_lower_shuffle = _mm_setr_epi8(
-        0xff, 0, 0xff, 1, 0xff, 2, 0xff, 3, 0xff, 4, 0xff, 5, 0xff, 6, 0xff, 7);
+        -1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7);
     __m128i second_upper_shuffle = _mm_setr_epi8(
-        8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15, 0xff);
+        8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1);
     __m128i second_lower_shuffle = _mm_setr_epi8(
-        0xff, 8, 0xff, 9, 0xff, 10, 0xff, 11, 0xff, 12, 0xff, 13, 0xff, 14, 0xff, 15);
+        -1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15);
     __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
     /* Work on 16 encoded characters at the time resulting in 32 decoded characters
        Examples are given for 8 encoded characters A until H to keep it readable.
@@ -126,7 +146,7 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
         __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
         __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
         __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
-        __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(0b1111));
+        __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15));
         __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
         _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
 
@@ -134,28 +154,34 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) {
         __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
         __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
         __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
-        __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(0b1111));
+        __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15));
         __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
         _mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);
 
         nibble_cursor += sizeof(__m128i);
         seq_cursor += 2 * sizeof(__m128i);
     }
-    #endif
-    while (seq_cursor < seq_end_ptr_twoatatime) {
-        // Note size_t cast helps gcc optimiser.
-        memcpy(seq_cursor, code2base + ((size_t)*nibble_cursor * 2), 2);
-        seq_cursor += 2;
-        nibble_cursor += 1;
+    nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
+}
+static void (*nibble2base)(uint8_t *nib, char *seq, int len);
+
+static void nibble2base_dispatch(uint8_t *nib, char *seq, int len) {
+    if (__builtin_cpu_supports("ssse3")) {
+        nibble2base = nibble2base_ssse3;
     }
-    if (seq_cursor != seq_end_ptr) {
-        /* There is a single encoded nuc left */
-        uint8_t nibble_c = *nibble_cursor;
-        uint8_t upper_nuc_index = nibble_c >> 4;
-        seq_cursor[0] = seq_nt16_str[upper_nuc_index];
+    else {
+        nibble2base = nibble2base_default;
     }
+    nibble2base(nib, seq, len);
 }
 
+static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_dispatch;
+
+#else
+static inline void nibble2base(uint8_t *nib, char *seq, int len) {
+    nibble2base_default(nib, seq, len);
+}
+#endif
 #ifdef __cplusplus
 }
 #endif

From 292a35d7c5181c02521aa3bb7dbd5891f1696967 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Mon, 22 Apr 2024 13:36:00 +0200
Subject: [PATCH 10/76] Improve compiler compatibility of nibble2base_ssse3

---
 htslib/hts_defs.h | 18 ++++++++++++++++++
 sam_internal.h    |  5 +++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h
index e714e8fda..0c5f8957a 100644
--- a/htslib/hts_defs.h
+++ b/htslib/hts_defs.h
@@ -34,6 +34,10 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute)
 #endif
 
+#ifdef __has_builtin
+#define HTS_COMPILER_HAS_BUILTIN(function) __has_builtin(function)
+#endif
+
 #elif defined __GNUC__
 #define HTS_GCC_AT_LEAST(major, minor) \
     (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
@@ -42,6 +46,10 @@ DEALINGS IN THE SOFTWARE.  */
 #ifndef HTS_COMPILER_HAS
 #define HTS_COMPILER_HAS(attribute) 0
 #endif
+#ifndef HTS_COMPILER_HAS_BUILTIN
+#define HTS_COMPILER_HAS_BUILTIN(function) 0
+#endif
+
 #ifndef HTS_GCC_AT_LEAST
 #define HTS_GCC_AT_LEAST(major, minor) 0
 #endif
@@ -118,6 +126,16 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_FORMAT(type, idx, first)
 #endif
 
+#define HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
+ ((HTS_COMPILER_HAS(target) && HTS_COMPILER_HAS_BUILTIN(__builtin_cpu_supports)) \
+ || HTS_GCC_AT_LEAST(4, 8))
+
+#if (defined(__x86_64__) || defined(_M_X64))
+#define HTS_BUILD_IS_X86_64 1
+#else
+#define HTS_BUILD_IS_X86_64 0
+#endif
+
 #if defined(_WIN32) || defined(__CYGWIN__)
 #if defined(HTS_BUILDING_LIBRARY)
 #define HTSLIB_EXPORT __declspec(dllexport)
diff --git a/sam_internal.h b/sam_internal.h
index f901070b6..e4c553d58 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -99,7 +99,8 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
         seq[i] = seq_nt16_str[bam_seqi(nib, i)];
 }
 
-#if HTS_GCC_AT_LEAST(4,8)
+#if HTS_BUILD_IS_X86_64 && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS
+#include "immintrin.h"
 /*
  * Convert a nibble encoded BAM sequence to a string of bases.
  *
@@ -108,7 +109,7 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
  * be converted to the IUPAC characters.
  * It falls back on the nibble2base_default function for the remainder.
  */
-#include "tmmintrin.h"
+
 __attribute__((target("ssse3")))
 static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
     seq[0] = 0;

From 9ad8270bd30d7b2d0bb20fa0b533a7abedd1cac7 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Wed, 8 May 2024 03:28:48 +1200
Subject: [PATCH 11/76] Use only _regions_add() when adding the list of contig
 names

Don't use _regions_init_string(), which misinterprets contig names
containing colons as region specification strings. The code used
_regions_init_string() rather than _regions_add() only when needed
to allocate a new bcf_sr_regions_t structure; instead extract basic
initialisation into a new bcf_sr_regions_alloc() function, which as
a bonus checks the memory allocation. Use the new function throughout.

Fixes samtools/bcftools#2179.
---
 synced_bcf_reader.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c
index a43ab15ae..9a3488a70 100644
--- a/synced_bcf_reader.c
+++ b/synced_bcf_reader.c
@@ -71,6 +71,7 @@ typedef struct
 }
 aux_t;
 
+static bcf_sr_regions_t *bcf_sr_regions_alloc(void);
 static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end);
 static bcf_sr_regions_t *_regions_init_string(const char *str);
 static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec);
@@ -368,13 +369,22 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
     if ( !files->explicit_regs && !files->streaming )
     {
         int n = 0, i;
-        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
-        for (i=0; i<n; i++)
+        const char **names;
+
+        if ( !files->regions )
         {
+            files->regions = bcf_sr_regions_alloc();
             if ( !files->regions )
-                files->regions = _regions_init_string(names[i]);
-            else
-                _regions_add(files->regions, names[i], -1, -1);
+            {
+                hts_log_error("Cannot allocate regions data structure");
+                return 0;
+            }
+        }
+
+        names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
+        for (i=0; i<n; i++)
+        {
+            _regions_add(files->regions, names[i], -1, -1);
         }
         free(names);
         _regions_sort_and_merge(files->regions);
@@ -956,6 +966,17 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
     return 1;
 }
 
+// Allocate a new region list structure.
+static bcf_sr_regions_t *bcf_sr_regions_alloc(void)
+{
+    bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
+    if ( !reg ) return NULL;
+
+    reg->start = reg->end = -1;
+    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    return reg;
+}
+
 // Add a new region into a list. On input the coordinates are 1-based, inclusive, then stored 0-based,
 // inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions().
 static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end)
@@ -1037,9 +1058,8 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg)
 // wouldn't learn the chromosome name.
 static bcf_sr_regions_t *_regions_init_string(const char *str)
 {
-    bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
-    reg->start = reg->end = -1;
-    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    bcf_sr_regions_t *reg = bcf_sr_regions_alloc();
+    if ( !reg ) return NULL;
 
     kstring_t tmp = {0,0,0};
     const char *sp = str, *ep = str;
@@ -1189,9 +1209,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr
         return reg;
     }
 
-    reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
-    reg->start = reg->end = -1;
-    reg->prev_start = reg->prev_end = reg->prev_seq = -1;
+    reg = bcf_sr_regions_alloc();
+    if ( !reg ) return NULL;
 
     reg->file = hts_open(regions, "rb");
     if ( !reg->file )

From 02853a921500c552a8dc32c6a3d68e2c00ade20c Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Tue, 30 Apr 2024 15:29:37 +0100
Subject: [PATCH 12/76] Extend annot-tsv, adding several new options

-d, --delim
    .. add support for other delimiters, not just tab

-h, --headers
    .. allow header lines in different formats. This is motivated
       by the common use of files with the first row containing
       column names but without the leading # character.

-I, --no-header-idx
    .. suppress column index numbers in the printed header. When
       given twice, drops the entire header

--help
    .. changed from `-h, --help` to just `--help`. Note this option
       was not advertised in the man page nor the usage page,
       therefore it should be okay to reuse it for `-h, --headers`.
---
 annot-tsv.1                 |  29 ++++++-
 annot-tsv.c                 | 160 +++++++++++++++++++++++++++---------
 test/annot-tsv/dst.11.txt   |   5 ++
 test/annot-tsv/dst.12.txt   |   5 ++
 test/annot-tsv/out.11.1.txt |   3 +
 test/annot-tsv/out.11.2.txt |   4 +
 test/annot-tsv/out.11.3.txt |   4 +
 test/annot-tsv/out.12.1.txt |   3 +
 test/annot-tsv/src.11.txt   |   5 ++
 test/annot-tsv/src.12.txt   |   5 ++
 test/test.pl                |   8 ++
 11 files changed, 193 insertions(+), 38 deletions(-)
 create mode 100644 test/annot-tsv/dst.11.txt
 create mode 100644 test/annot-tsv/dst.12.txt
 create mode 100644 test/annot-tsv/out.11.1.txt
 create mode 100644 test/annot-tsv/out.11.2.txt
 create mode 100644 test/annot-tsv/out.11.3.txt
 create mode 100644 test/annot-tsv/out.12.1.txt
 create mode 100644 test/annot-tsv/src.11.txt
 create mode 100644 test/annot-tsv/src.12.txt

diff --git a/annot-tsv.1 b/annot-tsv.1
index df3b06e91..34e1dd617 100644
--- a/annot-tsv.1
+++ b/annot-tsv.1
@@ -1,7 +1,7 @@
 '\" t
 .TH annot-tsv 1 "15 April 2024" "htslib-1.20" "Bioinformatics tools"
 .\"
-.\" Copyright (C) 2015, 2017-2018, 2023 Genome Research Ltd.
+.\" Copyright (C) 2015, 2017-2018, 2023-2024 Genome Research Ltd.
 .\"
 .\" Author: Petr Danecek
 .\"
@@ -108,6 +108,11 @@ Target file to be extend with annotations from
 Add the same annotations multiple times if multiple overlaps are found
 .RE
 .PP
+.B \-\-help
+.RS 4
+This help message
+.RE
+.PP
 .BR \-\-max\-annots " INT"
 .RS 4
 Add at most INT annotations per column to save time when many overlaps are found with a single region
@@ -138,11 +143,33 @@ number of source base pairs in the overlap
 .RE
 .RE
 .PP
+.BR \-d ", " \-\-delim " SRC:TGT"
+.RS 4
+Column delimiter in the source and the target file. For example, if both files are comma-delimited, run with
+"--delim ,:," or simply "--delim ,". If the source file is comma-delimited and the target file is tab-delimited,
+run with "-d $',:\\t'".
+.RE
+.PP
+.BR \-h ", " \-\-headers " SRC:TGT"
+.RS 4
+Line number of the header row with column names. By default the first line is interpreted as header if it starts with the comment
+character ("#"), otherwise expects numeric indices. However, if the first line does not start with "#" but still
+contains the column names, use "--headers 1:1". To ignore an existing header (skip comment lines) and use numeric indices,
+use "--headers 0:0" which is equivalent to "--ignore-headers". When a negative value is given, it is interpreted as the number of
+lines from the end of the comment block. Specifically, "--headers -1" takes the column names from the last line of
+the comment block (e.g., the "#CHROM" line in the VCF format).
+.RE
+.PP
 .BR \-H ", " \-\-ignore\-headers
 .RS 4
 Ignore the headers completely and use numeric indexes even when a header exists
 .RE
 .PP
+.BR \-I ", " \-\-no\-hdr\-idx
+.RS 4
+Suppress index numbers in the printed header. If given twice, drop the entire header.
+.RE
+.PP
 .BR \-O ", " \-\-overlap " FLOAT"
 .RS 4
 Minimum overlap as a fraction of region length in at least one of the overlapping regions. If also
diff --git a/annot-tsv.c b/annot-tsv.c
index 4661e6e0f..e5e3077eb 100644
--- a/annot-tsv.c
+++ b/annot-tsv.c
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2018-2023 Genome Research Ltd.
+    Copyright (C) 2018-2024 Genome Research Ltd.
 
     Author: Petr Danecek <pd3@sanger.ac.uk>
 
@@ -71,6 +71,7 @@ typedef struct
     cols_t *core, *match, *transfer, *annots;
     int *core_idx, *match_idx, *transfer_idx, *annots_idx;
     int *nannots_added; // for --max-annots: the number of annotations added
+    char delim;
     int grow_n;
     kstring_t line;     // one buffered line, a byproduct of reading the header
     htsFile *fp;
@@ -100,10 +101,10 @@ typedef struct
 {
     nbp_t *nbp;
     dat_t dst, src;
-    char *core_str, *match_str, *transfer_str, *annots_str;
+    char *core_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str;
     char *temp_dir, *out_fname;
     BGZF *out_fp;
-    int allow_dups, reciprocal, ignore_headers, max_annots, mode;
+    int allow_dups, reciprocal, max_annots, mode, no_write_hdr;
     double overlap;
     regidx_t *idx;
     regitr_t *itr;
@@ -282,7 +283,7 @@ int parse_tab_with_payload(const char *line, char **chr_beg, char **chr_end, hts
 
     dat_t *dat = (dat_t*) usr;
 
-    cols_t *cols = cols_split(line, NULL, '\t');
+    cols_t *cols = cols_split(line, NULL, dat->delim);
     *((cols_t**)payload) = cols;
 
     if ( cols->n < dat->core_idx[0] ) error("Expected at least %d columns, found %d: %s\n",dat->core_idx[0]+1,cols->n,line);
@@ -315,47 +316,90 @@ void free_payload(void *payload)
     cols_destroy(cols);
 }
 
-// Parse header if present (first line has a leading #) or create a dummy header with
-// numeric column names. If dummy is set, read first data line (without a leading #)
-// and create a dummy header.
-void parse_header(dat_t *dat, char *fname, int dummy)
+// Parse header if present, the parameter nth_row indicates the header row line number:
+//      0   .. ignore headers, create numeric fields names, 1-based indices
+//      N>0 .. N-th line, all previous lines are discarded
+//      N<0 .. N-th line from the end of the comment block (comment lines are prefixed with #),
+//             all preceding lines are discarded.
+// When autodetect is set, the argument nth_row is ignored.
+// Note this makes no attempt to preserve comment lines on output
+void parse_header(dat_t *dat, char *fname, int nth_row, int autodetect)
 {
     dat->fp = hts_open(fname,"r");
     if ( !dat->fp ) error("Failed to open: %s\n", fname);
 
+    // buffer comment lines when N<0
+    int nbuf = 0;
+    char **buf = NULL;
+    if ( nth_row < 0 )
+    {
+        buf = calloc(-nth_row,sizeof(*buf));
+        if ( !buf ) error("Out of memory, failed to allocate %zu bytes\n",(-nth_row)*sizeof(*buf));
+    }
+
+    int irow = 0;
     cols_t *cols = NULL;
     while ( hts_getline(dat->fp, KS_SEP_LINE, &dat->line) > 0 )
     {
-        if ( dat->line.s[0]=='#' )
+        if ( autodetect )
+        {
+            // if the first line is comment line, use it as a header. Otherwise go
+            // with numeric indices
+            nth_row = dat->line.s[0]=='#' ? 1 : 0;
+            break;
+        }
+        if ( nth_row==0 )
         {
-            // this is a header or comment line
-            if ( dummy ) continue;
-            cols = cols_split(dat->line.s, NULL, '\t');
+            // N=0 .. comment lines to be ignored, read until we get to the first data line
+            if ( dat->line.s[0]=='#' ) continue;
             break;
         }
+        if ( nth_row>0 )
+        {
+            // N>0 .. regardless of this being a comment or data line, read until Nth line
+            if ( ++irow < nth_row ) continue;
+            break;
+        }
+        // N<0 .. keep abs(N) comment lines in a sliding buffer
+        if ( dat->line.s[0]!='#' ) break;   // data line
+        if ( nbuf == -nth_row )
+        {
+            // one more comment line and the buffer is full. We could use round buffer
+            // for efficiency, but the assumption is abs(nth_row) is small
+            memmove(buf, &buf[1], (nbuf-1)*sizeof(*buf));
+            nbuf--;
+        }
+        buf[nbuf++] = strdup(dat->line.s);
+    }
 
-        // this a data line, we must be in a dummy mode
-        cols = cols_split(dat->line.s, NULL, '\t');
-        assert(cols && cols->n);
-        assert(cols->off[0][0] != '#');
+    if ( nth_row < 0 )
+    {
+        if ( nbuf!=-nth_row )
+            error("Found %d header lines in %s, cannot fetch N=%d from the end\n",nbuf,fname,-nth_row);
+        cols = cols_split(buf[0], NULL, dat->delim);
 
+    }
+    else
+        cols = cols_split(dat->line.s, NULL, dat->delim);
+
+    if ( !dat->line.l ) error("Failed to read: %s\n", fname);
+    assert(cols && cols->n);
+
+    if ( nth_row == 0 ) // create numeric indices
+    {
         // create a dummy header with numeric field names
         kstring_t str = {0,0,0};
         int i, n = cols->n;
         for (i=0; i<n; i++)
         {
-            if ( i>0 ) kputc('\t', &str);
+            if ( i>0 ) kputc(dat->delim, &str);
             kputw(i+1, &str);
         }
         cols_destroy(cols);
-        cols = cols_split(str.s, NULL, '\t');
+        cols = cols_split(str.s, NULL, dat->delim);
         free(str.s);
         dat->hdr.dummy = 1;
-
-        break;
     }
-    if ( !dat->line.l ) error("Failed to read: %s\n", fname);
-    assert(cols && cols->n);
 
     dat->hdr.name2idx = khash_str2int_init();
     int i;
@@ -377,24 +421,28 @@ void parse_header(dat_t *dat, char *fname, int dummy)
     }
     dat->hdr.cols = cols;
     if ( !dat->hdr.dummy ) dat->line.l = 0;
+
+    for (i=0; i<nbuf; i++) free(buf[i]);
+    free(buf);
 }
 void write_header(args_t *args, dat_t *dat)
 {
     if ( dat->hdr.dummy ) return;
+    if ( args->no_write_hdr>1 ) return;
     int i;
     kstring_t str = {0,0,0};
     kputc('#', &str);
     for (i=0; i<dat->hdr.cols->n; i++)
     {
-        if ( i>0 ) kputc('\t', &str);
-        ksprintf(&str,"[%d]", i+1);
+        if ( i>0 ) kputc(dat->delim, &str);
+        if ( !args->no_write_hdr ) ksprintf(&str,"[%d]", i+1);
         kputs(dat->hdr.cols->off[i], &str);
     }
     if ( dat->hdr.annots )
     {
         for (i=0; i<dat->hdr.annots->n; i++)
         {
-            if ( str.l > 1 ) kputc('\t', &str);
+            if ( str.l > 1 ) kputc(dat->delim, &str);
             kputs(dat->hdr.annots->off[i], &str);
         }
     }
@@ -434,8 +482,30 @@ void sanity_check_columns(char *fname, hdr_t *hdr, cols_t *cols, int **col2idx,
 }
 void init_data(args_t *args)
 {
-    parse_header(&args->dst, args->dst.fname, args->ignore_headers);
-    parse_header(&args->src, args->src.fname, args->ignore_headers);
+    if ( !args->delim_str )
+        args->dst.delim = args->src.delim = '\t';
+    else if ( strlen(args->delim_str)==1 )
+        args->dst.delim = args->src.delim = *args->delim_str;
+    else if ( strlen(args->delim_str)==3 && args->delim_str[1]==':' )
+        args->src.delim = args->delim_str[0], args->dst.delim = args->delim_str[2];
+    else
+        error("Could not parse the option --delim %s\n",args->delim_str);
+
+    // --headers, determine header row index
+    int isrc = 0, idst = 0, autodetect = 1;
+    if ( args->headers_str )
+    {
+        cols_t *tmp = cols_split(args->headers_str, NULL, ':');
+        char *rmme;
+        isrc = strtol(tmp->off[0],&rmme,10);
+        if ( *rmme || tmp->off[0]==rmme ) error("Could not parse the option --headers %s\n",args->headers_str);
+        idst = strtol(tmp->n==2 ? tmp->off[1] : tmp->off[0],&rmme,10);
+        if ( *rmme || (tmp->n==2 ? tmp->off[1] : tmp->off[0])==rmme ) error("Could not parse the option --headers %s\n",args->headers_str);
+        cols_destroy(tmp);
+        autodetect = 0;
+    }
+    parse_header(&args->dst, args->dst.fname, idst, autodetect);
+    parse_header(&args->src, args->src.fname, isrc, autodetect);
 
     // -c, core columns
     if ( !args->core_str ) args->core_str = "chr,beg,end:chr,beg,end";
@@ -608,17 +678,17 @@ static void write_annots(args_t *args)
     {
         if ( args->dst.annots_idx[i]==ANN_NBP )
         {
-            kputc('\t',&args->tmp_kstr);
+            kputc(args->dst.delim,&args->tmp_kstr);
             kputw(len,&args->tmp_kstr);
         }
         else if ( args->dst.annots_idx[i]==ANN_FRAC )
         {
-            kputc('\t',&args->tmp_kstr);
+            kputc(args->dst.delim,&args->tmp_kstr);
             kputd((double)len/(args->nbp->end - args->nbp->beg + 1),&args->tmp_kstr);
         }
         else if ( args->dst.annots_idx[i]==ANN_CNT )
         {
-            kputc('\t',&args->tmp_kstr);
+            kputc(args->dst.delim,&args->tmp_kstr);
             kputw(args->nbp->n/2,&args->tmp_kstr);
         }
     }
@@ -758,7 +828,7 @@ void process_line(args_t *args, char *line, size_t size)
     write_string(args, dst_cols->off[0], 0);
     for (i=1; i<dst_cols->n; i++)
     {
-        write_string(args, "\t", 1);
+        write_string(args, &args->dst.delim, 1);
         write_string(args, dst_cols->off[i], 0);
     }
     write_annots(args);
@@ -796,6 +866,7 @@ static const char *usage_text(void)
         "\n"
         "Other options:\n"
         "       --allow-dups        Add annotations multiple times\n"
+        "       --help              This help message\n"
         "       --max-annots INT    Adding at most INT annotations per column to save\n"
         "                           time in big regions\n"
         "       --version           Print version string and exit\n"
@@ -804,7 +875,12 @@ static const char *usage_text(void)
         "                             frac .. fraction of the target region with an\n"
         "                                       overlap\n"
         "                             nbp  .. number of source base pairs in the overlap\n"
-        "   -H, --ignore-headers    Use numeric indexes, ignore the headers completely\n"
+        "   -d, --delim SRC:TGT     Column delimiter in SRC and TGT file\n"
+        "   -h, --headers SRC:TGT   Header row line number, 0:0 is equivalent to -H, negative\n"
+        "                             value counts from the end of comment line block [1:1]\n"
+        "   -H, --ignore-headers    Use numeric indices, ignore the headers completely\n"
+        "   -I, --no-header-idx     Suppress index numbers in the printed header. If given\n"
+        "                           twice, drop the entire header\n"
         "   -O, --overlap FLOAT     Minimum required overlap (non-reciprocal, unless -r\n"
         "                           is given)\n"
         "   -r, --reciprocal        Apply the -O requirement to both overlapping\n"
@@ -847,18 +923,21 @@ int main(int argc, char **argv)
         {"target-file",required_argument,NULL,'t'},
         {"allow-dups",no_argument,NULL,0},
         {"max-annots",required_argument,NULL,2},
+        {"no-header-idx",required_argument,NULL,'I'},
         {"version",no_argument,NULL,1},
         {"annotate",required_argument,NULL,'a'},
+        {"headers",no_argument,NULL,'h'},
         {"ignore-headers",no_argument,NULL,'H'},
         {"overlap",required_argument,NULL,'O'},
         {"reciprocal",no_argument,NULL,'r'},
         {"drop-overlaps",no_argument,NULL,'x'},
-        {"help",no_argument,NULL,'h'},
+        {"delim",required_argument,NULL,'d'},
+        {"help",no_argument,NULL,4},
         {NULL,0,NULL,0}
     };
     char *tmp = NULL;
     int c;
-    while ((c = getopt_long(argc, argv, "hc:f:m:o:s:t:a:HO:rx",loptions,NULL)) >= 0)
+    while ((c = getopt_long(argc, argv, "c:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0)
     {
         switch (c)
         {
@@ -873,7 +952,10 @@ int main(int argc, char **argv)
                 args->max_annots = strtod(optarg, &tmp);
                 if ( tmp==optarg || *tmp ) error("Could not parse --max-annots  %s\n", optarg);
                 break;
-            case 'H': args->ignore_headers = 1; break;
+            case 'I': args->no_write_hdr++; break;
+            case 'd': args->delim_str = optarg; break;
+            case 'h': args->headers_str = optarg; break;
+            case 'H': args->headers_str = "0:0"; break;
             case 'r': args->reciprocal = 1; break;
             case 'c': args->core_str  = optarg; break;
             case 't': args->dst.fname = optarg; break;
@@ -888,7 +970,7 @@ int main(int argc, char **argv)
             case 's': args->src.fname = optarg; break;
             case 'f': args->transfer_str = optarg; break;
             case 'x': args->mode = PRINT_NONMATCHING; break;
-            case 'h': printf("\nVersion: %s\n%s\n",hts_version(),usage_text()); exit(EXIT_SUCCESS); break;
+            case  4 : printf("\nVersion: %s\n%s\n",hts_version(),usage_text()); exit(EXIT_SUCCESS); break;
             case '?': // fall through
             default: error("\nVersion: %s\n%s\n",hts_version(),usage_text()); break;
         }
@@ -914,7 +996,11 @@ int main(int argc, char **argv)
     while ( read_next_line(&args->dst) )
     {
         int i;
-        for (i=0; i<args->dst.grow_n; i++) kputs("\t.", &args->dst.line);
+        for (i=0; i<args->dst.grow_n; i++)
+        {
+            kputc(args->dst.delim, &args->dst.line);
+            kputc('.', &args->dst.line);
+        }
         process_line(args, args->dst.line.s, args->dst.line.l);
         args->dst.line.l = 0;
     }
diff --git a/test/annot-tsv/dst.11.txt b/test/annot-tsv/dst.11.txt
new file mode 100644
index 000000000..c54ad153a
--- /dev/null
+++ b/test/annot-tsv/dst.11.txt
@@ -0,0 +1,5 @@
+#ignore me
+#chr	beg	end	smpl
+1	10	20	A
+1	30	40	A
+1	50	60	A
diff --git a/test/annot-tsv/dst.12.txt b/test/annot-tsv/dst.12.txt
new file mode 100644
index 000000000..9b26b79af
--- /dev/null
+++ b/test/annot-tsv/dst.12.txt
@@ -0,0 +1,5 @@
+#ignore me
+#chr,beg,end,smpl
+1,10,20,A
+1,30,40,A
+1,50,60,A
diff --git a/test/annot-tsv/out.11.1.txt b/test/annot-tsv/out.11.1.txt
new file mode 100644
index 000000000..3de1f68ee
--- /dev/null
+++ b/test/annot-tsv/out.11.1.txt
@@ -0,0 +1,3 @@
+1	10	20	A	A
+1	30	40	A	B
+1	50	60	A	.
diff --git a/test/annot-tsv/out.11.2.txt b/test/annot-tsv/out.11.2.txt
new file mode 100644
index 000000000..a863f4e61
--- /dev/null
+++ b/test/annot-tsv/out.11.2.txt
@@ -0,0 +1,4 @@
+#[1]chr	[2]beg	[3]end	[4]smpl	[5]src_smpl
+1	10	20	A	A
+1	30	40	A	B
+1	50	60	A	.
diff --git a/test/annot-tsv/out.11.3.txt b/test/annot-tsv/out.11.3.txt
new file mode 100644
index 000000000..7a37130db
--- /dev/null
+++ b/test/annot-tsv/out.11.3.txt
@@ -0,0 +1,4 @@
+#chr	beg	end	smpl	src_smpl
+1	10	20	A	A
+1	30	40	A	B
+1	50	60	A	.
diff --git a/test/annot-tsv/out.12.1.txt b/test/annot-tsv/out.12.1.txt
new file mode 100644
index 000000000..7b6d0e994
--- /dev/null
+++ b/test/annot-tsv/out.12.1.txt
@@ -0,0 +1,3 @@
+1,10,20,A,A
+1,30,40,A,B
+1,50,60,A,.
diff --git a/test/annot-tsv/src.11.txt b/test/annot-tsv/src.11.txt
new file mode 100644
index 000000000..26eb20be6
--- /dev/null
+++ b/test/annot-tsv/src.11.txt
@@ -0,0 +1,5 @@
+#ignore me
+#chr1	beg1	end1	smpl1
+#chr2	beg2	end2	smpl2
+1	10	20	A
+1	30	40	B
diff --git a/test/annot-tsv/src.12.txt b/test/annot-tsv/src.12.txt
new file mode 100644
index 000000000..9b7ac367c
--- /dev/null
+++ b/test/annot-tsv/src.12.txt
@@ -0,0 +1,5 @@
+#ignore me
+#chr1,beg1,end1,smpl1
+#chr2,beg2,end2,smpl2
+1,10,20,A
+1,30,40,B
diff --git a/test/test.pl b/test/test.pl
index 03eca1129..5d298bc02 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -1414,4 +1414,12 @@ sub test_annot_tsv
     run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.4.txt',args=>'-m smpl -f smpl');
     run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.5.txt',args=>'-m smpl ');
     run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.6.txt',args=>'-m smpl -x');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2 -II');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.2.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.2.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2 -I');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2 -I');
+    run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.12.txt',out=>'out.12.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d ,');
+    run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>q[-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d $',:\t']);
 }

From 80bf6b8967f7bbb8e33cd0a6515dbf84b77990fe Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 9 May 2024 11:54:48 +0100
Subject: [PATCH 13/76] Fix a memory leak

---
 annot-tsv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/annot-tsv.c b/annot-tsv.c
index e5e3077eb..33b19e030 100644
--- a/annot-tsv.c
+++ b/annot-tsv.c
@@ -366,6 +366,7 @@ void parse_header(dat_t *dat, char *fname, int nth_row, int autodetect)
         {
             // one more comment line and the buffer is full. We could use round buffer
             // for efficiency, but the assumption is abs(nth_row) is small
+            free(buf[0]);
             memmove(buf, &buf[1], (nbuf-1)*sizeof(*buf));
             nbuf--;
         }

From b204d55c88008ee2b1ef1267e30efa99842e0277 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Thu, 9 May 2024 16:46:29 +0100
Subject: [PATCH 14/76] Make sure `-h -1` does not lose the first line

---
 annot-tsv.c  | 6 ++++--
 test/test.pl | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/annot-tsv.c b/annot-tsv.c
index 33b19e030..cbbf165e3 100644
--- a/annot-tsv.c
+++ b/annot-tsv.c
@@ -373,12 +373,13 @@ void parse_header(dat_t *dat, char *fname, int nth_row, int autodetect)
         buf[nbuf++] = strdup(dat->line.s);
     }
 
+    int keep_line = 0;
     if ( nth_row < 0 )
     {
         if ( nbuf!=-nth_row )
             error("Found %d header lines in %s, cannot fetch N=%d from the end\n",nbuf,fname,-nth_row);
         cols = cols_split(buf[0], NULL, dat->delim);
-
+        keep_line = 1;
     }
     else
         cols = cols_split(dat->line.s, NULL, dat->delim);
@@ -400,6 +401,7 @@ void parse_header(dat_t *dat, char *fname, int nth_row, int autodetect)
         cols = cols_split(str.s, NULL, dat->delim);
         free(str.s);
         dat->hdr.dummy = 1;
+        keep_line = 1;
     }
 
     dat->hdr.name2idx = khash_str2int_init();
@@ -421,7 +423,7 @@ void parse_header(dat_t *dat, char *fname, int nth_row, int autodetect)
         khash_str2int_set(dat->hdr.name2idx, cols->off[i], i);
     }
     dat->hdr.cols = cols;
-    if ( !dat->hdr.dummy ) dat->line.l = 0;
+    if ( !keep_line ) dat->line.l = 0;
 
     for (i=0; i<nbuf; i++) free(buf[i]);
     free(buf);
diff --git a/test/test.pl b/test/test.pl
index 5d298bc02..9beed91bb 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -1416,6 +1416,7 @@ sub test_annot_tsv
     run_annot_tsv($opts,src=>'src.10.txt',dst=>'dst.10.txt',out=>'out.10.6.txt',args=>'-m smpl -x');
     run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0');
     run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2 -II');
+    run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:-1 -II');
     run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.2.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2');
     run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.2.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2');
     run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr1,beg1,end1:chr,beg,end -f smpl1:src_smpl -h 2:2 -I');

From 667f14f48da6c6a135a24eb60df0b31b8a017782 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 9 May 2024 16:48:42 +0100
Subject: [PATCH 15/76] Add check for cpuid (for htscodecs)

Updates the htscodecs submodule to commit a7ce404:
 * Add configure checks for cpuid symbols

Adds similar configure checks to HTSlib, along with adjustments
to the Makefile and hts_probe_cc.sh for people who like to run
"make" without "./configure".  For both, if the cpuid check
fails, the SSE and AVX tests will be skipped as the features
they test for would not be used even if the corresponding code
is built.
---
 Makefile        |  4 ++++
 configure.ac    | 11 +++++++++++
 hts_probe_cc.sh | 27 +++++++++++++++++++++++++++
 htscodecs       |  2 +-
 4 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 99142c865..28c84bdcc 100644
--- a/Makefile
+++ b/Makefile
@@ -278,6 +278,10 @@ config.h:
 	echo '#endif' >> $@
 	echo '#define HAVE_DRAND48 1' >> $@
 	echo '#define HAVE_LIBCURL 1' >> $@
+	if [ "x$(HTS_HAVE_CPUID)" != "x" ]; then \
+	    echo '#define HAVE_DECL___CPUID_COUNT 1' >> $@ ; \
+	    echo '#define HAVE_DECL___GET_CPUID_MAX 1' >> $@ ; \
+	fi
 	if [ "x$(HTS_BUILD_SSE4)" != "x" ]; then \
 	    echo '#define HAVE_POPCNT 1' >> $@ ; \
 	    echo '#define HAVE_SSE4_1 1' >> $@ ; \
diff --git a/configure.ac b/configure.ac
index 49f2cbc70..5de64dc90 100644
--- a/configure.ac
+++ b/configure.ac
@@ -82,6 +82,14 @@ AC_CHECK_DECL([_XOPEN_SOURCE], [],
   [AC_DEFINE([_XOPEN_SOURCE], [600], [Specify X/Open requirements])],
   [])
 
+dnl Check that we have cpuid, and if so run the x86 SIMD checks
+AC_CHECK_DECLS([__get_cpuid_max, __cpuid_count], [
+   hts_have_cpuid=yes
+], [
+   hts_have_cpuid=no
+], [[#include <cpuid.h>]])
+
+AS_IF(test "x$hts_have_cpuid" = "xyes", [
 dnl Options for rANS32x16 sse4.1 version - sse4.1
 HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1 -mssse3 -mpopcnt],
  [AC_LANG_PROGRAM([[
@@ -100,6 +108,7 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1 -mssse3 -mpopcnt],
   AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.])
   AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled.
 ])
+
 dnl Propagate HTSlib's unaligned access preference to htscodecs
   AH_VERBATIM([UBSAN],[
 /* Prevent unaligned access in htscodecs SSE4 rANS codec */
@@ -148,6 +157,8 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt],
   AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.])
 ])
 
+]) dnl End of AS_IF(hts_have_cpuid)
+
 dnl Avoid chicken-and-egg problem where pkg-config supplies the
 dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check
 dnl for pkg-config...
diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh
index 48d0159c6..cbbee3990 100755
--- a/hts_probe_cc.sh
+++ b/hts_probe_cc.sh
@@ -51,6 +51,12 @@ run_compiler ()
 # again with it to see if the flag is needed.
 run_test ()
 {
+    if [ $have_cpuid -ne 1 ] ; then
+        # Only test for and build SSE / AVX code if cpuid works as
+        # otherwise it won't be executed, even if present
+        echo "$3 ="
+        return
+    fi
     rm -f conftest conftest.err conftest.c
     cat - > conftest.c
     if run_compiler ; then
@@ -66,6 +72,27 @@ run_test ()
 
 echo "# Compiler probe results, generated by $0"
 
+# Check for cpuid
+rm -f conftest conftest.err conftest.c
+cat > conftest.c <<'EOF'
+#include <cpuid.h>
+#include <stddef.h>
+int main(int argc, char **argv) {
+    unsigned int a, b, c, d;
+    int level = __get_cpuid_max(0, NULL);
+    if (level > 0)
+        __cpuid_count(1, 0, a, b, c, d);
+    return 0;
+}
+EOF
+if run_compiler ; then
+    echo "HTS_HAVE_CPUID = 1"
+    have_cpuid=1
+else
+    echo "HTS_HAVE_CPUID ="
+    have_cpuid=0
+fi
+
 # Check for sse4.1 etc. support
 run_test "-msse4.1 -mpopcnt -mssse3" HTS_CFLAGS_SSE4 HTS_BUILD_SSE4 <<'EOF'
 #ifdef __x86_64__
diff --git a/htscodecs b/htscodecs
index 3865c8820..a7ce40498 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit 3865c88208d8e667bcc4d2bfd49541074b797d03
+Subproject commit a7ce4049871ac2adecc6a1d2e78e7b54fa42a222

From 75767a308ccec058454ffb5d64e4f32eacea9239 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 13 May 2024 15:33:52 +0100
Subject: [PATCH 16/76] Test for more avx512 functions

Updates the htscodecs submodule to commit 998699d:
 * Test for more avx512 functions

Some compiler installations (notably xcode for MacOS El Capitan)
are missing some of the avx512f intrinsics.  As these were not
specifically checked for in configure, it enabled avx512 but
builds failed due to the missing symbols.  Fix by adding some
extra lines to the avx512f configure test so it can turn avx512
off on these platforms.
---
 configure.ac    | 4 +++-
 hts_probe_cc.sh | 4 +++-
 htscodecs       | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index 5de64dc90..39c8ee561 100644
--- a/configure.ac
+++ b/configure.ac
@@ -148,7 +148,9 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt],
     #ifdef __x86_64__
     __m512i a = _mm512_set1_epi32(1);
     __m512i b = _mm512_add_epi32(a, a);
-    return _mm_popcnt_u32(*((char *) &b));
+    __m256i c = _mm512_castsi512_si256(b);
+    __m256i d = _mm512_extracti64x4_epi64(a, 1);
+    return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d);
     #endif
   ]])], [
   hts_cflags_avx512="$flags_needed"
diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh
index cbbee3990..eaa19470e 100755
--- a/hts_probe_cc.sh
+++ b/hts_probe_cc.sh
@@ -131,7 +131,9 @@ run_test "-mavx512f -mpopcnt" HTS_CFLAGS_AVX512 HTS_BUILD_AVX512 <<'EOF'
 int main(int argc, char **argv) {
     __m512i a = _mm512_set1_epi32(1);
     __m512i b = _mm512_add_epi32(a, a);
-    return _mm_popcnt_u32(*((char *) &b));
+    __m256i c = _mm512_castsi512_si256(b);
+    __m256i d = _mm512_extracti64x4_epi64(a, 1);
+    return _mm_popcnt_u32(*((char *) &c)) + (*(char *) &d);
 }
 #else
 int main(int argc, char **argv) { return 0; }
diff --git a/htscodecs b/htscodecs
index a7ce40498..998699daa 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit a7ce4049871ac2adecc6a1d2e78e7b54fa42a222
+Subproject commit 998699daaade6222f70b61a554bf1f770cc337fc

From 58ef9ec14b487bf80230ae04980b501ce7ef5ea2 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 13 May 2024 15:35:14 +0100
Subject: [PATCH 17/76] Check that __builtin_cpu_supports("ssse3") works

This builtin doesn't understand ssse3 on some old versions of
clang.  To allow builds using these compilers, add a configure
check to ensure it works.

For make-only builds, we currently assume it does for X86
platforms.  As the problem only affected clang 3.7 and 3.8, both
of which are very old now, it seems reasonable to require use
of configure to support them.
---
 Makefile       |  3 +++
 configure.ac   | 14 ++++++++++++++
 sam_internal.h |  4 +++-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 28c84bdcc..54bca03d0 100644
--- a/Makefile
+++ b/Makefile
@@ -296,6 +296,9 @@ config.h:
 	if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \
 	    echo '#define HAVE_AVX512 1' >> $@ ; \
 	fi
+	echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@
+	echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@
+	echo '#endif' >> $@
 
 # And similarly for htslib.pc.tmp ("pkg-config template").  No dependency
 # on htslib.pc.in listed, as if that file is newer the usual way to regenerate
diff --git a/configure.ac b/configure.ac
index 39c8ee561..13d91d218 100644
--- a/configure.ac
+++ b/configure.ac
@@ -159,6 +159,20 @@ HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f -mpopcnt],
   AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.])
 ])
 
+dnl Check for working __builtin_cpu_supports (ssse3 is broken on some clangs)
+AC_MSG_CHECKING([for working __builtin_cpu_supports("ssse3")])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
+  if (__builtin_cpu_supports("ssse3")) {
+    return 0;
+  }
+])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_BUILTIN_CPU_SUPPORT_SSSE3], 1,
+            [Defined to 1 if __builtin_cpu_supports("ssse3") works])
+], [
+  AC_MSG_RESULT([no])
+])
+
 ]) dnl End of AS_IF(hts_have_cpuid)
 
 dnl Avoid chicken-and-egg problem where pkg-config supplies the
diff --git a/sam_internal.h b/sam_internal.h
index e4c553d58..9457b57cf 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -99,7 +99,9 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
         seq[i] = seq_nt16_str[bam_seqi(nib, i)];
 }
 
-#if HTS_BUILD_IS_X86_64 && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS
+#if HTS_BUILD_IS_X86_64 \
+    && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
+    && HAVE_BUILTIN_CPU_SUPPORT_SSSE3
 #include "immintrin.h"
 /*
  * Convert a nibble encoded BAM sequence to a string of bases.

From 6b0a9e97da8a0772e012379b01296685b54eaa5d Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 16 May 2024 13:38:20 +0100
Subject: [PATCH 18/76] Update htscodecs submodule to commit 5a2627e

 * Avoid typedef disabled by setting _XOPEN_SOURCE
---
 htscodecs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htscodecs b/htscodecs
index 998699daa..5a2627ec4 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit 998699daaade6222f70b61a554bf1f770cc337fc
+Subproject commit 5a2627ec4d6bf0c96b0d7e85a0b031e3ce80f8c1

From 30c9c50a874059e3dae7ff8c0ad9e8a9258031c8 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Fri, 8 Jul 2022 18:03:06 +0100
Subject: [PATCH 19/76] Warn if bgzf_getline() returned apparently
 UTF-16-encoded text

Text files badly transferred from Windows may occasionally be
UTF-16-encoded, and this may not be easily noticed by the user.
HTSlib should not accept such encoding (as other tools surely don't,
hence doing so would cause interoperability problems), but it should
ideally emit a warning or error message identifying the problem.

Reading text from a htsFile/samFile/vcfFile will already have failed
with EFTYPE/ENOEXEC if the text file is UTF-16-encoded, as the encoding
will not have been recognised by hts_detect_format().

OTOH bgzf_getline() will return a UTF-16-encoded text line. Add a
suitable context-dependent diagnostic to the BGZF-based bgzf_getline()
calls in HTSlib: in hts_readlist()/hts_readlines(), emit a warning
(once, on the first line); in tbx.c, emit a more specific error message
if get_intv() parsing failure is due to UTF-16 encoding.

[TODO] If utf16_text_format were added to htsFormatCategory,
the new is_utf16_text() function is suitable for detecting it.
---
 hts.c          | 31 +++++++++++++++++++++++++++++++
 hts_internal.h |  3 +++
 tbx.c          |  7 +++++--
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/hts.c b/hts.c
index 1021cf748..fc745cd95 100644
--- a/hts.c
+++ b/hts.c
@@ -431,6 +431,27 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim)
     return 1;
 }
 
+static inline int
+alternate_zeros(const unsigned char *u, const unsigned char *ulim)
+{
+    for (; u < ulim; u += 2)
+        if (*u != '\0') return 0;
+    return 1;
+}
+
+static int is_utf16_text(const unsigned char *u, const unsigned char *ulim)
+{
+    if (ulim - u >= 6 &&
+        ((u[0] == 0xfe && u[1] == 0xff && alternate_zeros(u+2, ulim)) ||
+         (u[0] == 0xff && u[1] == 0xfe && alternate_zeros(u+3, ulim))))
+        return 2;
+    else if (ulim - u >= 8 &&
+             (alternate_zeros(u, ulim) || alternate_zeros(u+1, ulim)))
+        return 1;
+    else
+        return 0;
+}
+
 static int is_fastaq(const unsigned char *u, const unsigned char *ulim)
 {
     const unsigned char *eol = memchr(u, '\n', ulim - u);
@@ -1961,6 +1982,12 @@ hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname)
     return fp;
 }
 
+int hts_is_utf16_text(const kstring_t *str)
+{
+    const unsigned char *u = (const unsigned char *) (str->s);
+    return (str->l > 0 && str->s)? is_utf16_text(u, u + str->l) : 0;
+}
+
 // For VCF/BCF backward sweeper. Not exposing these functions because their
 // future is uncertain. Things will probably have to change with hFILE...
 BGZF *hts_get_bgzfp(htsFile *fp)
@@ -2030,6 +2057,8 @@ char **hts_readlist(const char *string, int is_file, int *_n)
         while ((ret = bgzf_getline(fp, '\n', &str)) >= 0)
         {
             if (str.l == 0) continue;
+            if (n == 0 && hts_is_utf16_text(&str))
+                hts_log_warning("'%s' appears to be encoded as UTF-16", string);
             if (hts_resize(char*, n + 1, &m, &s, 0) < 0)
                 goto err;
             s[n] = strdup(str.s);
@@ -2089,6 +2118,8 @@ char **hts_readlines(const char *fn, int *_n)
         str.s = 0; str.l = str.m = 0;
         while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
             if (str.l == 0) continue;
+            if (n == 0 && hts_is_utf16_text(&str))
+                hts_log_warning("'%s' appears to be encoded as UTF-16", fn);
             if (hts_resize(char *, n + 1, &m, &s, 0) < 0)
                 goto err;
             s[n] = strdup(str.s);
diff --git a/hts_internal.h b/hts_internal.h
index 191a55d16..70abbb29c 100644
--- a/hts_internal.h
+++ b/hts_internal.h
@@ -87,6 +87,9 @@ typedef struct hts_cram_idx_t {
     struct cram_fd *cram;
 } hts_cram_idx_t;
 
+// Determine whether the string's contents appear to be UTF-16-encoded text.
+// Returns 1 if they are, 2 if there is also a BOM, or 0 otherwise.
+int hts_is_utf16_text(const kstring_t *str);
 
 // Entry point to hFILE_multipart backend.
 struct hFILE *hopen_htsget_redirect(struct hFILE *hfile, const char *mode);
diff --git a/tbx.c b/tbx.c
index ade2e9f09..662500549 100644
--- a/tbx.c
+++ b/tbx.c
@@ -229,8 +229,11 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_
             case TBX_UCSC: type = "TBX_UCSC"; break;
             default: type = "TBX_GENERIC"; break;
         }
-        hts_log_error("Failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"",
-            type, str->s);
+        if (hts_is_utf16_text(str))
+            hts_log_error("Failed to parse %s: offending line appears to be encoded as UTF-16", type);
+        else
+            hts_log_error("Failed to parse %s: was wrong -p [type] used?\nThe offending line was: \"%s\"",
+                type, str->s);
         return -1;
     }
 }

From 5d3186803e627766aedcc381043a788995dd8f1b Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 16 May 2024 18:10:51 +0100
Subject: [PATCH 20/76] Avoid possible race when selecting nibble2base
 implementation

Selecting which version of nibble2base() to use on first call
is fine in a single thread, but may lead to a race in
multi-threaded code.  While this is likely harmless (the value
stored to the function pointer will always be the same, and the
update will probably be a single store), it is best to avoid
the problem by making the selection at library initialisation,
before any threads have started.

As the optimised nibble2base_ssse3() already uses gcc function
attributes, it seems reasonable to use __attribute__((constructor))
on the function that selects the version to use.  If the
constructor does not run, it will use the non-optimised version,
which is a safe default.
---
 sam_internal.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/sam_internal.h b/sam_internal.h
index 9457b57cf..aed503925 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -166,20 +166,16 @@ static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
     }
     nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
 }
-static void (*nibble2base)(uint8_t *nib, char *seq, int len);
 
-static void nibble2base_dispatch(uint8_t *nib, char *seq, int len) {
+static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
+
+__attribute__((constructor))
+static void nibble2base_resolve(void) {
     if (__builtin_cpu_supports("ssse3")) {
         nibble2base = nibble2base_ssse3;
     }
-    else {
-        nibble2base = nibble2base_default;
-    }
-    nibble2base(nib, seq, len);
 }
 
-static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_dispatch;
-
 #else
 static inline void nibble2base(uint8_t *nib, char *seq, int len) {
     nibble2base_default(nib, seq, len);

From 60d7aab2c2341a7e115d46c8d310f799f2ff99b8 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Fri, 19 Apr 2024 12:21:50 +0100
Subject: [PATCH 21/76] Add an inline version of bgzf_read for small reads.

The bgzf_read function is long and not something that's likely to get
inlined even if it was in the header.

However most of the time our calls request a small amount of data and
they fit within the buffer we've read, so we offer a static inline to
do the memcpy when we can, falling back to the long function when we
cannot.

In terms of total CPU time there is not much difference, but the key
thing is that the time saved is often in the main thread, given that the
bulk of the decode is often threaded.  Example timings for
test_view -B -@16:

develop:

    real    0m48.158s
    user    6m2.901s
    sys     0m28.134s

    real    0m48.730s
    user    6m3.707s
    sys     0m28.473s

    real    0m48.653s
    user    6m5.215s
    sys     0m28.637s

This PR:

    real    0m41.731s
    user    5m59.780s
    sys     0m30.393s

    real    0m41.945s
    user    6m0.367s
    sys     0m30.426s

So we can see it's a consistent win when threading, potentially 10-15%
faster throughput depending on workloads.
---
 htslib/bgzf.h | 20 ++++++++++++++++++++
 sam.c         |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index ea4ec3ece..519606009 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -31,6 +31,7 @@
 #define HTSLIB_BGZF_H
 
 #include <stdint.h>
+#include <string.h>
 #include <sys/types.h>
 
 #include "hts_defs.h"
@@ -143,6 +144,25 @@ typedef struct BGZF BGZF;
     HTSLIB_EXPORT
     ssize_t bgzf_read(BGZF *fp, void *data, size_t length) HTS_RESULT_USED;
 
+/**
+ * bgzf_read optimised for small quantities, as a static inline
+ * See bgzf_read() normal function for return values.
+ */
+static inline ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) {
+    // A block length of 0 implies current block isn't loaded (see
+    // bgzf_seek_common).  That gives negative available so careful on types
+    if ((ssize_t)length < fp->block_length - fp->block_offset) {
+        // Short cut the common and easy mode
+        memcpy((uint8_t *)data,
+               (uint8_t *)fp->uncompressed_block + fp->block_offset,
+               length);
+        fp->block_offset += length;
+        return length;
+    } else {
+        return bgzf_read(fp, data, length);
+    }
+}
+
     /**
      * Write _length_ bytes from _data_ to the file.  If no I/O errors occur,
      * the complete _length_ bytes will be written (or queued for writing).
diff --git a/sam.c b/sam.c
index 42bcbb5a7..031b80532 100644
--- a/sam.c
+++ b/sam.c
@@ -56,6 +56,8 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/hts_expr.h"
 #include "header.h"
 
+#define bgzf_read bgzf_read_small
+
 #include "htslib/khash.h"
 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
 KHASH_SET_INIT_INT(tag)

From 68564b0361f77c347116196cc184cd1a8eda5d38 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 25 Apr 2024 17:37:32 +0100
Subject: [PATCH 22/76] A few speed ups for bam_read1.

When running with a high thread count, the decompression stage in
bgzf_read is often the only threaded part, meaning whatever is left in
main can become the bottleneck once we have a sufficient number of
threads running.  Hence speeding up anything in bam_read1 is key.

- sam_realloc_bam_data now adds an extra 32 bytes to the requested
  size after rounding up to a power of 2.  This may not seem much,
  however in tests it makes a significant reduction to memory copies
  (and also, strangely, memory size).  Tested with both GNU malloc and
  tcmalloc.

- bam_tag2cigar speed up by reordering the checks and simplifying the
  expression to look for the necessary cigar field.

- Avoid a bgzf read and copying from bgzf buffer to the temporary x[]
  when we know we can copy direct.  This subverts the bgzf interface,
  but it's internal code.

Some benchmarks of test_view -B -@16 in.bam

develop:     9.73 sec
prev commit: 8.32 sec
this commit: 7.81 sec

Combined this is ~25% speed up.
---
 sam.c | 85 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 23 deletions(-)

diff --git a/sam.c b/sam.c
index 031b80532..a3fd5e60b 100644
--- a/sam.c
+++ b/sam.c
@@ -433,7 +433,8 @@ int sam_realloc_bam_data(bam1_t *b, size_t desired)
     uint32_t new_m_data;
     uint8_t *new_data;
     new_m_data = desired;
-    kroundup32(new_m_data);
+    kroundup32(new_m_data); // next power of 2
+    new_m_data += 32; // reduces malloc arena migrations?
     if (new_m_data < desired) {
         errno = ENOMEM; // Not strictly true but we can't store the size
         return -1;
@@ -674,25 +675,36 @@ hts_pos_t bam_endpos(const bam1_t *b)
 static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
 {
     bam1_core_t *c = &b->core;
-    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
-    uint8_t *CG;
 
-    // test where there is a real CIGAR in the CG tag to move
-    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
-    cigar0 = bam_get_cigar(b);
-    if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
-    fake_bytes = c->n_cigar * 4;
+    // Bail out as fast as possible for the easy case
+    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
+    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
+        return 0;
+
+    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
+    // but this is much less likely so do as a secondary check.
+    if (c->tid < 0 || c->pos < 0)
+        return 0;
+
+    // Do we have a CG tag?
+    uint8_t *CG = bam_aux_get(b, "CG");
     int saved_errno = errno;
-    CG = bam_aux_get(b, "CG");
     if (!CG) {
         if (errno != ENOENT) return -1;  // Bad aux data
         errno = saved_errno; // restore errno on expected no-CG-tag case
         return 0;
     }
+
+    // Now we start with the serious work migrating CG to CIGAR
+    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
+        *cigar0, CG_len, fake_bytes;
+    cigar0 = bam_get_cigar(b);
+    fake_bytes = c->n_cigar * 4;
     if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
         return 0; // not of type B,I
     CG_len = le_to_u32(CG + 2);
-    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
+    // don't move if the real CIGAR length is shorter than the fake cigar length
+    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
 
     // move from the CG tag to the right position
     cigar_st = (uint8_t*)cigar0 - b->data;
@@ -701,9 +713,12 @@ static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0
     CG_st = CG - b->data - 2;
     CG_en = CG_st + 8 + n_cigar4;
     if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
-    b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
-    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
-    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
+    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
+    b->l_data = b->l_data - fake_bytes + n_cigar4;
+    // insert c->n_cigar-fake_bytes empty space to make room
+    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
+    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
+    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
     if (ori_len > CG_en) // move data after the CG tag
         memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
     b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
@@ -776,16 +791,36 @@ int bam_read1(BGZF *fp, bam1_t *b)
     if (fp->is_be)
         ed_swap_4p(&block_len);
     if (block_len < 32) return -4;  // block_len includes core data
-    if (bgzf_read(fp, x, 32) != 32) return -3;
-    if (fp->is_be) {
-        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
+    if (fp->block_length - fp->block_offset > 32) {
+        // Avoid bgzf_read and a temporary copy to a local buffer
+        uint8_t *x = fp->uncompressed_block + fp->block_offset;
+        c->tid        = le_to_u32(x);
+        c->pos        = le_to_i32(x+4);
+        uint32_t x2   = le_to_u32(x+8);
+        c->bin        = x2>>16;
+        c->qual       = x2>>8&0xff;
+        c->l_qname    = x2&0xff;
+        c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
+        uint32_t x3   = le_to_u32(x+12);
+        c->flag       = x3>>16;
+        c->n_cigar    = x3&0xffff;
+        c->l_qseq     = le_to_u32(x+16);
+        c->mtid       = le_to_u32(x+20);
+        c->mpos       = le_to_i32(x+24);
+        c->isize      = le_to_i32(x+28);
+        fp->block_offset += 32;
+    } else {
+        if (bgzf_read(fp, &x, 32) != 32) return -3;
+        if (fp->is_be) {
+            for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
+        }
+        c->tid = x[0]; c->pos = (int32_t)x[1];
+        c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+        c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
+        c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+        c->l_qseq = x[4];
+        c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
     }
-    c->tid = x[0]; c->pos = (int32_t)x[1];
-    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
-    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
-    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
-    c->l_qseq = x[4];
-    c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
 
     new_l_data = block_len - 32 + c->l_extranul;
     if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
@@ -796,7 +831,7 @@ int bam_read1(BGZF *fp, bam1_t *b)
     b->l_data = new_l_data;
 
     if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
-    if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
+    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
         if (fixup_missing_qname_nul(b) < 0) return -4;
     }
     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
@@ -808,6 +843,7 @@ int bam_read1(BGZF *fp, bam1_t *b)
     if (bam_tag2cigar(b, 0, 0) < 0)
         return -4;
 
+    // TODO: consider making this conditional
     if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
         hts_pos_t rlen, qlen;
         bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
@@ -4299,6 +4335,9 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
 
             fd->curr_bam = NULL;
             fd->curr_idx = 0;
+        // Consider prefetching next record?  I.e.
+        // } else {
+        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
         }
 
         ret = 0;

From ee9a36f151508102cf84b866bf84626e1c776719 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 29 Apr 2024 12:36:02 +0100
Subject: [PATCH 23/76] Add a bgzf_write_small function for optimising small
 writes.

As we did with bgzf_read_small, shortcutting the big bgzf_write
function for the common use case of a small write that fits into the
buffer can help reduce the pressure on the main thread.

Benchmarks with test_view:

Previous commit:

    for i in `seq 1 3`;do taskset -c 0-31 /usr/bin/time -f '%U user\t%S system\t%e elapsed\t%P %%CPU' ./test/test_view -@32 -b ~/scratch/data/novaseq.10m.bam -p /tmp/_.bam ;done;md5sum /tmp/_.bam
    89.81 user	2.22 system	4.11 elapsed	2237% %CPU
    89.57 user	2.43 system	4.20 elapsed	2189% %CPU
    88.44 user	2.30 system	3.96 elapsed	2291% %CPU
    bc9ca86ebef3b6669fc7b6fdd7e1acb6  /tmp/_.bam

This commit:

    for i in `seq 1 3`;do taskset -c 0-31 /usr/bin/time -f '%U user\t%S system\t%e elapsed\t%P %%CPU' ./test/test_view -@32 -b ~/scratch/data/novaseq.10m.bam -p /tmp/_.bam ;done;md5sum /tmp/_.bam
    86.45 user	1.91 system	3.49 elapsed	2531% %CPU
    86.28 user	1.84 system	3.43 elapsed	2562% %CPU
    86.81 user	2.19 system	3.54 elapsed	2509% %CPU
    bc9ca86ebef3b6669fc7b6fdd7e1acb6  /tmp/_.bam

So that's about 14% faster throughput.  It harms some other places, so
this isn't a blanket define of bgzf_write to bgzf_write_small.

Also, following the observation above, the use of bgzf_read_small is
similarly restricted to bam_read1 instead of applying to the whole of
sam.c.  The indirection via the small function harms big reads, which
affects SAM reading.

Benchmarks on ./test/test_view -@32 -b /tmp/_.sam.gz -p /tmp/_.bam
shows this speeds up from 3.7s elapsed to 3.4s elapsed.  Small, but
consistent.
---
 htslib/bgzf.h | 16 ++++++++++++++++
 sam.c         |  8 +++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index 519606009..886c60767 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -175,6 +175,22 @@ static inline ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) {
     HTSLIB_EXPORT
     ssize_t bgzf_write(BGZF *fp, const void *data, size_t length) HTS_RESULT_USED;
 
+/**
+ * bgzf_write optimised for small quantities, as a static inline
+ * See bgzf_write() normal function for return values.
+ */
+static inline ssize_t bgzf_write_small(BGZF *fp, void *data, size_t length) {
+    if (fp->is_compressed && BGZF_BLOCK_SIZE - fp->block_offset > length) {
+        // Short cut the common and easy mode
+        memcpy((uint8_t *)fp->uncompressed_block + fp->block_offset,
+               data, length);
+        fp->block_offset += length;
+        return length;
+    } else {
+        return bgzf_write(fp, data, length);
+    }
+}
+
     /**
      * Write _length_ bytes from _data_ to the file, the index will be used to
      * decide the amount of uncompressed data to be written to each bgzip block.
diff --git a/sam.c b/sam.c
index a3fd5e60b..a1adb91ae 100644
--- a/sam.c
+++ b/sam.c
@@ -56,8 +56,6 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/hts_expr.h"
 #include "header.h"
 
-#define bgzf_read bgzf_read_small
-
 #include "htslib/khash.h"
 KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
 KHASH_SET_INIT_INT(tag)
@@ -776,6 +774,7 @@ static int fixup_missing_qname_nul(bam1_t *b) {
  * Note a second interface that returns a bam pointer instead would avoid bam_copy1
  * in multi-threaded handling.  This may be worth considering for htslib2.
  */
+#define bgzf_read bgzf_read_small
 int bam_read1(BGZF *fp, bam1_t *b)
 {
     bam1_core_t *c = &b->core;
@@ -793,7 +792,7 @@ int bam_read1(BGZF *fp, bam1_t *b)
     if (block_len < 32) return -4;  // block_len includes core data
     if (fp->block_length - fp->block_offset > 32) {
         // Avoid bgzf_read and a temporary copy to a local buffer
-        uint8_t *x = fp->uncompressed_block + fp->block_offset;
+        uint8_t *x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
         c->tid        = le_to_u32(x);
         c->pos        = le_to_i32(x+4);
         uint32_t x2   = le_to_u32(x+8);
@@ -859,7 +858,9 @@ int bam_read1(BGZF *fp, bam1_t *b)
 
     return 4 + block_len;
 }
+#undef bgzf_read
 
+#define bgzf_write bgzf_write_small
 int bam_write1(BGZF *fp, const bam1_t *b)
 {
     const bam1_core_t *c = &b->core;
@@ -927,6 +928,7 @@ int bam_write1(BGZF *fp, const bam1_t *b)
     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
     return ok? 4 + block_len : -1;
 }
+#undef bgzf_write
 
 /*
  * Write a BAM file and append to the in-memory index simultaneously.

From db34f68470a684bdf6d41fdb799bb710d4e386da Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 9 May 2024 15:11:56 +0100
Subject: [PATCH 24/76] Changes from review

---
 sam.c | 78 ++++++++++++++++++++++++++---------------------------------
 1 file changed, 34 insertions(+), 44 deletions(-)

diff --git a/sam.c b/sam.c
index a1adb91ae..e39ec3f85 100644
--- a/sam.c
+++ b/sam.c
@@ -774,16 +774,16 @@ static int fixup_missing_qname_nul(bam1_t *b) {
  * Note a second interface that returns a bam pointer instead would avoid bam_copy1
  * in multi-threaded handling.  This may be worth considering for htslib2.
  */
-#define bgzf_read bgzf_read_small
 int bam_read1(BGZF *fp, bam1_t *b)
 {
     bam1_core_t *c = &b->core;
     int32_t block_len, ret, i;
-    uint32_t x[8], new_l_data;
+    uint32_t new_l_data;
+    uint8_t tmp[32], *x;
 
     b->l_data = 0;
 
-    if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
+    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
         if (ret == 0) return -1; // normal end-of-file
         else return -2; // truncated
     }
@@ -792,34 +792,27 @@ int bam_read1(BGZF *fp, bam1_t *b)
     if (block_len < 32) return -4;  // block_len includes core data
     if (fp->block_length - fp->block_offset > 32) {
         // Avoid bgzf_read and a temporary copy to a local buffer
-        uint8_t *x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
-        c->tid        = le_to_u32(x);
-        c->pos        = le_to_i32(x+4);
-        uint32_t x2   = le_to_u32(x+8);
-        c->bin        = x2>>16;
-        c->qual       = x2>>8&0xff;
-        c->l_qname    = x2&0xff;
-        c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
-        uint32_t x3   = le_to_u32(x+12);
-        c->flag       = x3>>16;
-        c->n_cigar    = x3&0xffff;
-        c->l_qseq     = le_to_u32(x+16);
-        c->mtid       = le_to_u32(x+20);
-        c->mpos       = le_to_i32(x+24);
-        c->isize      = le_to_i32(x+28);
+        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
         fp->block_offset += 32;
     } else {
-        if (bgzf_read(fp, &x, 32) != 32) return -3;
-        if (fp->is_be) {
-            for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
-        }
-        c->tid = x[0]; c->pos = (int32_t)x[1];
-        c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
-        c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
-        c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
-        c->l_qseq = x[4];
-        c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
-    }
+        x = tmp;
+        if (bgzf_read(fp, x, 32) != 32) return -3;
+    }
+
+    c->tid        = le_to_u32(x);
+    c->pos        = le_to_i32(x+4);
+    uint32_t x2   = le_to_u32(x+8);
+    c->bin        = x2>>16;
+    c->qual       = x2>>8&0xff;
+    c->l_qname    = x2&0xff;
+    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
+    uint32_t x3   = le_to_u32(x+12);
+    c->flag       = x3>>16;
+    c->n_cigar    = x3&0xffff;
+    c->l_qseq     = le_to_u32(x+16);
+    c->mtid       = le_to_u32(x+20);
+    c->mpos       = le_to_i32(x+24);
+    c->isize      = le_to_i32(x+28);
 
     new_l_data = block_len - 32 + c->l_extranul;
     if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
@@ -829,14 +822,14 @@ int bam_read1(BGZF *fp, bam1_t *b)
     if (realloc_bam_data(b, new_l_data) < 0) return -4;
     b->l_data = new_l_data;
 
-    if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
+    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
     if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
         if (fixup_missing_qname_nul(b) < 0) return -4;
     }
     for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
     c->l_qname += c->l_extranul;
     if (b->l_data < c->l_qname ||
-        bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
+        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
         return -4;
     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
     if (bam_tag2cigar(b, 0, 0) < 0)
@@ -858,9 +851,7 @@ int bam_read1(BGZF *fp, bam1_t *b)
 
     return 4 + block_len;
 }
-#undef bgzf_read
 
-#define bgzf_write bgzf_write_small
 int bam_write1(BGZF *fp, const bam1_t *b)
 {
     const bam1_core_t *c = &b->core;
@@ -891,15 +882,15 @@ int bam_write1(BGZF *fp, const bam1_t *b)
     if (fp->is_be) {
         for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
         y = block_len;
-        if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
         swap_data(c, b->l_data, b->data, 1);
     } else {
-        if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
     }
-    if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
-    if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
+    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
+    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
     if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
-        if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
+        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
     } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
         uint8_t buf[8];
         uint32_t cigar_st, cigar_en, cigar[2];
@@ -918,17 +909,16 @@ int bam_write1(BGZF *fp, const bam1_t *b)
         cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
         u32_to_le(cigar[0], buf);
         u32_to_le(cigar[1], buf + 4);
-        if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
-        if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
-        if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
+        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
+        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
+        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
         u32_to_le(c->n_cigar, buf);
-        if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
-        if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
+        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
+        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
     }
     if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
     return ok? 4 + block_len : -1;
 }
-#undef bgzf_write
 
 /*
  * Write a BAM file and append to the in-memory index simultaneously.

From 11205a9ba5e4fc39cc8bb9844d73db2a63fb8119 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 4 Jun 2024 11:38:03 +0100
Subject: [PATCH 25/76] Add tests for BAM alignment records that span BGZF
 blocks

HTSlib starts a new block if an alignment is likely to overflow
the current one, so for its own data this will only happen for
records longer than 64kbytes.  As other implementations may not do
this, check that reading works correctly on some BAM files where
records have been deliberately split between BGZF blocks.

Additionally, check the writing side by making a record with
enough CIGAR entries to make it split into multiple BGZF blocks.
---
 .gitignore                                |   1 +
 Makefile                                  |   1 +
 test/bgzf_boundaries/bgzf_boundaries1.bam | Bin 0 -> 432 bytes
 test/bgzf_boundaries/bgzf_boundaries2.bam | Bin 0 -> 432 bytes
 test/bgzf_boundaries/bgzf_boundaries3.bam | Bin 0 -> 897 bytes
 test/test.pl                              |  37 ++++++++++++++++++++++
 6 files changed, 39 insertions(+)
 create mode 100644 test/bgzf_boundaries/bgzf_boundaries1.bam
 create mode 100644 test/bgzf_boundaries/bgzf_boundaries2.bam
 create mode 100644 test/bgzf_boundaries/bgzf_boundaries3.bam

diff --git a/.gitignore b/.gitignore
index 8b4d74ca1..9b7e26f71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,7 @@ shlib-exports-*.txt
 /bgzip
 /htsfile
 /tabix
+/test/bgzf_boundaries/*.tmp.*
 /test/faidx/*.tmp*
 /test/faidx/FAIL*
 /test/fieldarith
diff --git a/Makefile b/Makefile
index 54bca03d0..8e2cccdfc 100644
--- a/Makefile
+++ b/Makefile
@@ -914,6 +914,7 @@ htslib-uninstalled.pc: htslib.pc.tmp
 testclean:
 	-rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \
                test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \
+               test/bgzf_boundaries/*.tmp.* \
                header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt
 	-rm -rf htscodecs/tests/test.out
 
diff --git a/test/bgzf_boundaries/bgzf_boundaries1.bam b/test/bgzf_boundaries/bgzf_boundaries1.bam
new file mode 100644
index 0000000000000000000000000000000000000000..264e22fade451667e7402884dc87870e2fae86c6
GIT binary patch
literal 432
zcmb2|=3rp}f&Xj_PR>jWaSV(O4Ez5(Ir=in14SKz1382Jteic9{C)j{{e4~IJvn{+
ztPBkdEG-NSxEO&-c!3yMo}nXypF!%8$2<q1gglz@3Ji=)41fQBW@udCQOATufiNTE
zR|c;CKp7wa886J}Da6bmz>orD|AzvQTyRj3ftjhXrGcKIiKV%Pv855i2B3N&pwtAQ
zkCYmnlp2L#$Vo$xAY#G?<Q3(_73CBa<mHu>6_u2gl@yiam6U;yG!WtL%oyS8%o^e6
n%n;%4%;*@L{4N5BBSWG>LmB*vZH2!A-H#q<(hSTXAA<n^4?#|L

literal 0
HcmV?d00001

diff --git a/test/bgzf_boundaries/bgzf_boundaries2.bam b/test/bgzf_boundaries/bgzf_boundaries2.bam
new file mode 100644
index 0000000000000000000000000000000000000000..704804eaf4b01d841a5c5159c009645b642ac79a
GIT binary patch
literal 432
zcmb2|=3rp}f&Xj_PR>jWaSV(O4Ez5(Ir=in14SKz1382Jteic9{C)j{{e4~IJvn{+
ztPBkdEG-NSxEO&-c!3yMo}nXypF!%8$2<q1gglz@x(tl`3?Ke~21<bdNWU<nXX4yS
z1%4nCT{Rmc;|m7y|3b_R0t_iY`TtM=WHJN?1sRx`8e1CZ8Jbv{TNqm!F>C-DCj^w5
z0Q8Ykqmxpj5DYnK2ogk0_<+2koVcQ#qJq4<va+I*va*t*lDv{K5RwKW+?^RCe4SY%
p{G1sg{GAycgOlGy0C8kURA?we>7(y$FM#ex4+d!lW{{7;006j}Pw4;v

literal 0
HcmV?d00001

diff --git a/test/bgzf_boundaries/bgzf_boundaries3.bam b/test/bgzf_boundaries/bgzf_boundaries3.bam
new file mode 100644
index 0000000000000000000000000000000000000000..328a274518ca22ced4adc0c7e5e01bfdd9359743
GIT binary patch
literal 897
zcmb2|=3rp}f&Xj_PR>jWaSV(O4Ez5(Ir=in14SKz1382Jteic9{C)j{{e4~IJvn{+
ztPBkdEG-NSxEO&-c!3yMo}nXypF!%8$2<q1gglz@x(tl`3?Ke~21<bdNWU<nXX4yS
z1%4nChiV~a1_6c?p!|O@*v=c7j7RnV|3H-t!9hU=W~RoL2D#T6{^L=tXJ}$+ZeeU`
z#IOPA7@<v_osoD{0}Y-43@)WcC#6OqrN)DwYy|MA2C4ueCk;UeQd<58pV@GA5Qh2$
z2R=MzBdca&ViKIdKNXK^pbBKb_}t_>ajNAN<-~n7pFP2&T2W3>L0(>2Sy4$@SxHgp
z+9%(9JgViDlz|WqtjaQW;86`%2VpE)d~*u5RY$lxGe-D2v;KbmdNm%iBmA5hBK(~h
y9fOnKMFb~nhH~0sQ?1It$j<QN|GUVLsL)V`o*Azc*@0;WJzYyPFoV)47ytlGQLteE

literal 0
HcmV?d00001

diff --git a/test/test.pl b/test/test.pl
index 9beed91bb..8aaf044d3 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -819,6 +819,43 @@ sub test_view
         }
     }
 
+    # BAM files with alignment records that span BGZF blocks
+    # HTSlib starts a new block if an alignment is likely to overflow the
+    # current one, so for its own data this will only happen for records
+    # longer than 64kbytes.  As other implementations may not do this,
+    # check that reading works correctly on some BAM files where records
+    # have been deliberately split between BGZF blocks.
+    print "test_view testing BAM records in multiple BGZF blocks:\n";
+    $test_view_failures = 0;
+    my $src_sam = "ce#1.sam";
+    foreach my $test_bam (qw(bgzf_boundaries/bgzf_boundaries1.bam
+                          bgzf_boundaries/bgzf_boundaries2.bam
+                          bgzf_boundaries/bgzf_boundaries3.bam)) {
+        testv $opts, "./test_view $tv_args -p $test_bam.tmp.sam $test_bam";
+        testv $opts, "./compare_sam.pl $test_bam.tmp.sam $src_sam";
+    }
+
+    # Test a file with a long alignment record.  Boundaries hit in the middle of
+    # the CIGAR data, and in the sequence.  Generate the test file here as it's
+    # big, but with fairly simple contents.
+    $src_sam = "bgzf_boundaries/large_rec.tmp.sam";
+    open(my $test_sam, '>', $src_sam) || die "Couldn't open $src_sam : $!\n";
+    print $test_sam "\@HD\tVN:1.6\tSO:coordinate\n";
+    print $test_sam "\@SQ\tSN:ref\tLN:100000\n";
+    print $test_sam "read\t0\tref\t1\t60\t", "1M1I" x 16000, "\t*\t0\t0\t", "A" x 32000, "\t", "Q" x 32000, "\n";
+    close($test_sam) || die "Error on closing $src_sam : $!\n";
+
+    testv $opts, "./test_view $tv_args -b -l 0 -p $src_sam.bam $src_sam";
+    testv $opts, "./test_view $tv_args -p $src_sam.bam.sam $src_sam.bam";
+    testv $opts, "./compare_sam.pl $src_sam $src_sam.bam.sam";
+
+    if ($test_view_failures == 0) {
+        passed($opts, "BAM records spanning multiple BGZF block tests");
+    } else {
+        failed($opts, "BAM records spanning multiple BGZF block tests",
+               "$test_view_failures subtests failed");
+    }
+
     # embed_ref=2 mode
     print "test_view testing embed_ref=2:\n";
     $test_view_failures = 0;

From 1c8c335d5c3a5a29edbe03b9d12338404aff8b48 Mon Sep 17 00:00:00 2001
From: Andrew Whitwham <aw7@sanger.ac.uk>
Date: Fri, 31 May 2024 09:51:14 +0100
Subject: [PATCH 26/76] Make BAM zero-length intervals work like CRAM

A BED file containing a zero-length interval such as:
chr1 1000 1000

worked with the CRAM multi-region iterator but not with BAM.  This change makes them work the same.  Fixes #2060.
---
 hts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hts.c b/hts.c
index fc745cd95..923eaaf02 100644
--- a/hts.c
+++ b/hts.c
@@ -3258,7 +3258,7 @@ static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid,
     size_t reg_bin_count = 0, hash_bin_count;
     int res;
 
-    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end)
+    if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg > end)
         return -1;
 
     hash_bin_count = kh_n_buckets(bidx);

From 356bfd6b9a573afeed5e4e266f9e15ad71f10896 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 31 May 2024 14:41:55 +0100
Subject: [PATCH 27/76] Fix possible out-of-bounds read in hts_itr_multi_next()
 (CRAM only)

When multi-iterating CRAM files, hts_itr_multi_next()
tries to calculate the chr:start-end range so that the
multi-threaded CRAM decoder does not do unnecessary work.
If it finds that the iterator is going to switch to another
reference, this optimisation gets turned off and the end
limit is ignored.

Unfortunately, the original version of this code, added in
commit d3147150, did not disable the end point update after it
had detected a switch to a different reference.  This could lead
to an out-of-bounds read because it did not switch to the correct
intervals array for the later references.  The end values looked
up in that case were not used, but it could cause a segfault
if the later references had many more intervals requested than
the first one.  Fix by only updating end when on the correct
reference.

This bug is present in releases 1.11 to 1.20
---
 hts.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/hts.c b/hts.c
index 923eaaf02..caf85e64a 100644
--- a/hts.c
+++ b/hts.c
@@ -4449,11 +4449,12 @@ int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r)
                                     break;
 
                                 uint64_t max = iter->off[j].max;
-                                if ((max>>32) != tid)
+                                if ((max>>32) != tid) {
                                     tid = HTS_IDX_START; // => no range limit
-
-                                if (end < rl->intervals[max & 0xffffffff].end)
-                                    end = rl->intervals[max & 0xffffffff].end;
+                                } else {
+                                    if (end < rl->intervals[max & 0xffffffff].end)
+                                        end = rl->intervals[max & 0xffffffff].end;
+                                }
                                 if (v < iter->off[j].v)
                                     v = iter->off[j].v;
                                 j++;

From 5f205334cd04bcdac74b1b6d46f88d57a05eaf1c Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 3 Jun 2024 13:54:16 +0100
Subject: [PATCH 28/76] Add regression test for hts_itr_multi_next() OOB read

Add a multi-region iterator look-up on range.cram, with one
CHROMOSOME_I region and enough CHROMOSOME_II regions to trigger
the bug on the unfixed version of the function.
---
 test/range.out2 | 21 +++++++++++++++++++++
 test/test.pl    | 12 ++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 test/range.out2

diff --git a/test/range.out2 b/test/range.out2
new file mode 100644
index 000000000..22e6fd542
--- /dev/null
+++ b/test/range.out2
@@ -0,0 +1,21 @@
+@HD	VN:1.4	SO:coordinate
+@RG	ID:1	PL:ILLUMINA	PU:130410_HS18_09653_A_C1JT2ACXX_4	LB:7053878	DT:2013-04-10T00:00:00+0100	SM:ERS225193	CN:SC
+@SQ	SN:CHROMOSOME_I	LN:1009800	M5:8ede36131e0dbf3417807e48f77f3ebd	UR:/
+@SQ	SN:CHROMOSOME_II	LN:5000	M5:8e7993f7a93158587ee897d7287948ec	UR:/
+@SQ	SN:CHROMOSOME_III	LN:5000	M5:3adcb065e1cf74fafdbba1e8c352b323	UR:/
+@SQ	SN:CHROMOSOME_IV	LN:5000	M5:251af66a69ee589c9f3757340ec2de6f	UR:/
+@SQ	SN:CHROMOSOME_V	LN:5000	M5:cf200a65fb754836dcc56b24b3170ee8	UR:/
+@SQ	SN:CHROMOSOME_X	LN:5000	M5:6f9368fd2192c89c613718399d2d31fc	UR:/
+@SQ	SN:CHROMOSOME_MtDNA	LN:5000	M5:cd05857ece6411f40257a565ccfe15bb	UR:/
+@PG	ID:scramble	PN:scramble	VN:1.14.7	CL:scramble -M -I sam -s 50 -r /tmp/ce.fa - /tmp/ERR304769_subset.cram
+HS18_09653:4:2108:14085:93656	147	CHROMOSOME_I	1122	60	100M	=	756	-466	AATTTGCAAGAAAATTCGCAAGAAATTTGTATTAAAAACTGTTCAAAATTTTTGGAAATTAGTTTAAAAATCTCACATTTTTTTTAGAAAAATTATTTTT	GEFGHHFHEGGIFEFHFH<HHGGEFIAHEEFGEHFHFDFGDHG@HGGFFIIHHG8HICFBCEGICHEGIBHEHH;CGGFDGGJFFHGGDGGFFFEGDDE?	X0:i:1	X1:i:0	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2302:10109:87885	147	CHROMOSOME_II	1136	60	100M	=	734	-502	TGATTCATTTTATATTCTATATACTCATGTAATATGCCCATGTAAGGTTTAATTCCAAAAATATGAGCGTGTTCTATTTTATAATATTTTACTAAAATAC	GFGFEEGBEHH8BEFHCGGFEF.G:GB9FBFGEA@FB:<GF<G8FDFCGFCHBEHHDEEFDBGFCFHIEBCDE-ACGGGIFDDFFEFCHA=FDFAGDCC?	X0:i:1	X1:i:0	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2108:10782:59721	163	CHROMOSOME_II	1241	60	100M	=	1366	225	AGTTAATTGCACTCAAATTTGTTGTTCTTCATTCTCTCGTTATGATTTAATCTTATTGCGTCAAGGTCATTATTTTAGGTCCATTAGTTATCGATCTGAA	?EDDGEFFFGGHIIHFGIHGHEHDHGDEHHGHGEHGHGHHGGHIGGGIGGIGIGGHHGHHGHHFFIHJGFHGHHGLGHGHGGHFKEHGIGGH@FHIFGFG	X0:i:1	X1:i:0	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:1107:10640:32305	99	CHROMOSOME_II	1267	60	100M	=	1778	611	CTTCATTCTCTCGTTATGATTTAATCTTATTGCGTCAAGGTCATTATTTTAGGTCCATTAGTTATCGATCTGAAACATGTTGTTGTATTTTTCTATTCTT	D?EFDEGGFFGEGHGGGHIGGGHHEGHIFGGEGFCGHFHGCHGGGHFKIHGGGGHFHFFHFFGGGGHGHHGHGFFHGGGHHGFGHGFGHFGGFHHBHGFF	X0:i:1	X1:i:0	BC:Z:GTNTGCNG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:<<!2@@!2	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2213:16876:56957	99	CHROMOSOME_II	1326	60	100M	=	1651	425	AGTTATCGATCTGAAACATGTTGTTGTATTTTTCTATTCTTGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTT	BCEDDD:AAFGDG<F2DGHHGGGHDECGGG@GGFD/HGFGCB0GEEFGDHGHEGHFGHCFGHGFGHFHHHDFGFGHGEF.F-HGGBEGFGGGFHHGHEEI	X0:i:1	X1:i:0	BC:Z:GTNTGCNG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:<<!2@@!4	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2211:18838:86627	147	CHROMOSOME_II	1345	60	100M	=	828	-617	GTTGTTGTATTTTTCTATTCTTGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTTTTTCAAGATCTAATCAATT	<GFDIEGH@FGDBFFEFFGFGAGHHAFH@HHGFHIHFGHGHG:CH:GCFHGGHGHGFEHGCJEJCGHGIGEEEFGFFGBHGHGEFHFCFGGG>FECDDE?	X0:i:1	X1:i:0	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2314:21094:58611	99	CHROMOSOME_II	1353	60	100M	=	1775	522	ATTTTTCTATTCTTGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACAT	D;?FBD9CDBGBGG?GF8DFGFFHDACDGFGGD/HGHHGFFEFGD=FGIG0D.GH7HHFFGFDGGFF:HFDGGHGGGGE;F:@GGEGGCFGFGHHB@FHG	X0:i:1	X1:i:0	BC:Z:GTNTGCCG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=?!4AD22	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2108:10782:59721	83	CHROMOSOME_II	1366	60	100M	=	1241	-225	TGTGAGCTCAGGACACCTCATACAACTCCAGAGAAAATGTGTCTCATTATTCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACATTAACGACGTTTTT	IFGHDHHFFGHIIEGGGHEHHJGGGFGGHFHHGFGGGGGGHHDHFHGIF=IFIFHIGIHGHF=HGJGGGFGGGHEEHGFGGFGEGGGGEGFFGGGFEBCD	X0:i:1	X1:i:0	BC:Z:GTNTGCCG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=?!4AD+2	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2111:5602:28724	99	CHROMOSOME_II	1416	60	100M	=	1881	565	TCTTGTCTTTTTTCAAGATCTAATCAATTTTCTACATTAACGACGTTTTTGTCGTTCTGCTTCTTTTTTTCGTTCGTTTGTCTCGTCCATCAGCTGTCCA	ECE>EGGGGFGGGGDGFEFGGGFHEGHGIIFGFEJGHHFGGGHFGEFHIHGFFGGECGFHHGGFGHIHHHGEGGHBGBGHHEHGEBGGFFGFFHHGCGFF	X0:i:1	X1:i:0	BC:Z:GTNTGCCG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=@!4AD24	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:2103:6720:15025	99	CHROMOSOME_II	1459	60	100M	=	1617	258	CGTTTTTGTCGTTCTGCTTCTTTTTTTCGTTCGTTTGTCTCGTCCATCAGCTGTCCACTCATTTCTCTCCCACTCACTAGGCAGTGCTTTGTTTGGTTCC	ECEFFGGGEHGEGGGGGGHFGGGHIGHIGGGG?HFGHGEGFBFGGGFGIHGDGGDEDFF<FGGGGHFGGFFAGEGBGGCHFEFGGGEHEHGDGF:FFFFC	X0:i:1	X1:i:0	BC:Z:GTNTGCGG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=?!4AD22	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
+HS18_09653:4:1316:7415:40818	99	CHROMOSOME_II	1536	60	100M	=	1839	403	TAGGCAGTGCTTTGTTTGGTTCCGATTGGCAGCTGGCTGCAGGGCCTGCATCTCTTCTATGTCTCTCATTTACTTGCATTCTTTTCTTCGTTAATTTTTG	AC?>FGGAEFGGGDDGEGGFGGEEEGEIFGFG@E<GH>>EFGDG?HCFCF>DGGHDFFCHF>=G;CFBEHG<GCCGGEEHFDHGGHGGFFGGDDFHHGH?	X0:i:1	X1:i:0	BC:Z:GTNTGCGG	XG:i:0	AM:i:37	SM:i:37	XM:i:0	XO:i:0	QT:Z:=?!4AD+2	XT:A:U	MD:Z:100	NM:i:0	RG:Z:1
diff --git a/test/test.pl b/test/test.pl
index 8aaf044d3..ef6a56612 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -887,6 +887,18 @@ sub test_view
     testv $opts, "./test_view $tv_args range.bam $regions > range.tmp";
     testv $opts, "./compare_sam.pl range.tmp range.out";
 
+    # Regression check for out-of-bounds read on regions list (see
+    # samtools#2063).  As reg_insert() allocates at least four slots
+    # for chromosome regions, we need more than that many in the second
+    # chr. requested to ensure it has a bigger array.
+
+    $regions = "CHROMOSOME_I:1122-1122 CHROMOSOME_II:1136-1136 CHROMOSOME_II:1241-1241 CHROMOSOME_II:1267-1267 CHROMOSOME_II:1326-1326 CHROMOSOME_II:1345-1345 CHROMOSOME_II:1353-1353 CHROMOSOME_II:1366-1366 CHROMOSOME_II:1416-1416 CHROMOSOME_II:1459-1459 CHROMOSOME_II:1536-1536";
+    testv $opts, "./test_view $tv_args -i reference=ce.fa -M range.cram $regions > range.tmp";
+    testv $opts, "./compare_sam.pl range.tmp range.out2";
+
+    testv $opts, "./test_view $tv_args -M range.bam $regions > range.tmp";
+    testv $opts, "./compare_sam.pl range.tmp range.out2";
+
     if ($test_view_failures == 0) {
         passed($opts, "range.cram tests");
     } else {

From 61b922b4d95c70af248e4c21bd552172ae3f82d1 Mon Sep 17 00:00:00 2001
From: Martin Pollard <mp15@sanger.ac.uk>
Date: Thu, 20 Jun 2024 11:48:50 +0100
Subject: [PATCH 29/76] Replace assert(0) with abort()

---
 synced_bcf_reader.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c
index 9a3488a70..1835ea2d6 100644
--- a/synced_bcf_reader.c
+++ b/synced_bcf_reader.c
@@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
 #include <config.h>
 
+#include <stdlib.h>
 #include <assert.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -542,7 +543,7 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_
     }
     if (!reader->itr) {
         hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1);
-        assert(0);
+        abort();
     }
     return 0;
 }

From a0969f42cf9d9fc9add367aff591cb31f170d7b6 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 1 Jul 2024 14:46:25 +0100
Subject: [PATCH 30/76] Fix a missing bgzf->uncompressed_address incr in
 bgzf_read_small

This bug crept in with #1772 which was added since last release, so
there is no regression.

Fixes #1798 with thanks to John Marshall
---
 htslib/bgzf.h    |  4 +++-
 test/test_bgzf.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index 886c60767..c6ce7c172 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -157,6 +157,7 @@ static inline ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) {
                (uint8_t *)fp->uncompressed_block + fp->block_offset,
                length);
         fp->block_offset += length;
+        fp->uncompressed_address += length;
         return length;
     } else {
         return bgzf_read(fp, data, length);
@@ -179,7 +180,8 @@ static inline ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) {
  * bgzf_write optimised for small quantities, as a static inline
  * See bgzf_write() normal function for return values.
  */
-static inline ssize_t bgzf_write_small(BGZF *fp, void *data, size_t length) {
+static inline
+ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) {
     if (fp->is_compressed && BGZF_BLOCK_SIZE - fp->block_offset > length) {
         // Short cut the common and easy mode
         memcpy((uint8_t *)fp->uncompressed_block + fp->block_offset,
diff --git a/test/test_bgzf.c b/test/test_bgzf.c
index 6cb6db902..244ababc5 100644
--- a/test/test_bgzf.c
+++ b/test/test_bgzf.c
@@ -179,7 +179,7 @@ static int try_bgzf_close(BGZF **bgz, const char *name, const char *func, int ex
 
 static ssize_t try_bgzf_read(BGZF *fp, void *data, size_t length,
                              const char *name, const char *func) {
-    ssize_t got = bgzf_read(fp, data, length);
+    ssize_t got = bgzf_read_small(fp, data, length);
     if (got < 0) {
         fprintf(stderr, "%s : Error from bgzf_read %s : %s\n",
                 func, name, strerror(errno));
@@ -189,7 +189,7 @@ static ssize_t try_bgzf_read(BGZF *fp, void *data, size_t length,
 
 static ssize_t try_bgzf_write(BGZF *fp, const void *data, size_t length,
                               const char *name, const char *func) {
-    ssize_t put = bgzf_write(fp, data, length);
+    ssize_t put = bgzf_write_small(fp, data, length);
     if (put < (ssize_t) length) {
         fprintf(stderr, "%s : %s %s : %s\n",
                 func, put < 0 ? "Error writing to" : "Short write on",
@@ -878,6 +878,49 @@ static int test_tell_read(Files *f, const char *mode) {
     return -1;
 }
 
+static int test_useek_read_small(Files *f, const char *mode) {
+
+    BGZF* bgz = NULL;
+    char bg_buf[99];
+
+    bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__);
+    if (!bgz) goto fail;
+
+
+    if (try_bgzf_write(bgz, "#>Hello, World!\n", 16,
+                       f->tmp_bgzf, __func__) != 16)
+        goto fail;
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
+
+    bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__);
+    if (!bgz) goto fail;
+
+    if (try_bgzf_getc(bgz, 0, '#', f->tmp_bgzf, __func__) < 0 ||
+        try_bgzf_getc(bgz, 1, '>', f->tmp_bgzf, __func__) < 0)
+        goto fail;
+
+    if (try_bgzf_read(bgz, bg_buf, 5, f->tmp_bgzf, __func__) != 5)
+        goto fail;
+    if (memcmp(bg_buf, "Hello", 5) != 0)
+        goto fail;
+
+    if (try_bgzf_useek(bgz, 9, SEEK_SET, f->tmp_bgzf, __func__) < 0)
+        goto fail;
+
+    if (try_bgzf_read(bgz, bg_buf, 5, f->tmp_bgzf, __func__) != 5)
+        goto fail;
+    if (memcmp(bg_buf, "World", 5) != 0)
+        goto fail;
+
+    if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail;
+    return 0;
+
+ fail:
+    fprintf(stderr, "%s: failed\n", __func__);
+    if (bgz) bgzf_close(bgz);
+    return -1;
+}
+
 static int test_bgzf_getline(Files *f, const char *mode, int nthreads) {
     BGZF* bgz = NULL;
     ssize_t bg_put;
@@ -1098,6 +1141,10 @@ int main(int argc, char **argv) {
     if (test_tell_read(&f, "w") != 0) goto out;
     if (test_tell_read(&f, "wu") != 0) goto out;
 
+    // bgzf_useek and bgzf_read_small
+    if (test_useek_read_small(&f, "w") != 0) goto out;
+    if (test_useek_read_small(&f, "wu") != 0) goto out;
+
     // getline
     if (test_bgzf_getline(&f, "w", 0) != 0) goto out;
     if (test_bgzf_getline(&f, "w", 1) != 0) goto out;

From abe85836d525ef484e79243f3f992b638930ced0 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 2 Jul 2024 14:35:27 +0100
Subject: [PATCH 31/76] Protect against negative POS fields in CRAM encoder.

We check pos against the ref end, and if that fits we then fetch
`&ref[apos]` to get the reference, but for a negative POS this reads
before the start of the array.

The fastest way to handle all this and other related errors is simply
to sanity check the input for out of bounds POS and bail out early.

Credit to OSS-Fuzz
Fixes oss-fuzz 70014
---
 cram/cram_encode.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index 4a762f7b0..d59ea5253 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -3401,6 +3401,8 @@ static int process_one_read(cram_fd *fd, cram_container *c,
 
     c->num_bases   += cr->len;
     cr->apos        = bam_pos(b)+1;
+    if (cr->apos < 0 || cr->apos > INT64_MAX/2)
+        goto err;
     if (c->pos_sorted) {
         if (cr->apos < s->last_apos && !fd->ap_delta) {
             c->pos_sorted = 0;

From c722ae5dd370dd8d1fea609f7b5f601b18a2ee68 Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Mon, 22 Apr 2024 13:36:00 +0200
Subject: [PATCH 32/76] Move nibble2base_ssse3() SIMD code to new simd.c source
 file

This function is always called via a function pointer, hence it is
never inlined so need not be in a header.

[Commit marked as authored by RV so git blame attributes the moved
code to Ruben, as it is unchanged from the original code. Temporarily
include simd.c from the header so this commit builds and runs. -JM]
---
 sam_internal.h | 65 +-------------------------------------------------
 simd.c         | 64 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 64 deletions(-)
 create mode 100644 simd.c

diff --git a/sam_internal.h b/sam_internal.h
index aed503925..00be2c215 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -102,70 +102,7 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
 #if HTS_BUILD_IS_X86_64 \
     && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
     && HAVE_BUILTIN_CPU_SUPPORT_SSSE3
-#include "immintrin.h"
-/*
- * Convert a nibble encoded BAM sequence to a string of bases.
- *
- * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be
- * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can
- * be converted to the IUPAC characters.
- * It falls back on the nibble2base_default function for the remainder.
- */
-
-__attribute__((target("ssse3")))
-static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
-    seq[0] = 0;
-    const char *seq_end_ptr = seq + len;
-    char *seq_cursor = seq;
-    uint8_t *nibble_cursor = nib;
-    const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
-    __m128i first_upper_shuffle = _mm_setr_epi8(
-        0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
-    __m128i first_lower_shuffle = _mm_setr_epi8(
-        -1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7);
-    __m128i second_upper_shuffle = _mm_setr_epi8(
-        8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1);
-    __m128i second_lower_shuffle = _mm_setr_epi8(
-        -1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15);
-    __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
-    /* Work on 16 encoded characters at the time resulting in 32 decoded characters
-       Examples are given for 8 encoded characters A until H to keep it readable.
-        Encoded stored as |AB|CD|EF|GH|
-        Shuffle into |AB|00|CD|00|EF|00|GH|00| and
-                     |00|AB|00|CD|00|EF|00|GH|
-        Shift upper to the right resulting into
-                     |0A|B0|0C|D0|0E|F0|0G|H0| and
-                     |00|AB|00|CD|00|EF|00|GH|
-        Merge with or resulting into (X stands for garbage)
-                     |0A|XB|0C|XD|0E|XF|0G|XH|
-        Bitwise and with 0b1111 leads to:
-                     |0A|0B|0C|0D|0E|0F|0G|0H|
-        We can use the resulting 4-bit integers as indexes for the shuffle of
-        the nucleotide lookup. */
-    while (seq_cursor < seq_vec_end_ptr) {
-        __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);
-
-        __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
-        __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
-        __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
-        __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
-        __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15));
-        __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
-        _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
-
-        __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
-        __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
-        __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
-        __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
-        __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15));
-        __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
-        _mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);
-
-        nibble_cursor += sizeof(__m128i);
-        seq_cursor += 2 * sizeof(__m128i);
-    }
-    nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
-}
+#include "simd.c"
 
 static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
 
diff --git a/simd.c b/simd.c
new file mode 100644
index 000000000..65e64cd2d
--- /dev/null
+++ b/simd.c
@@ -0,0 +1,64 @@
+#include "immintrin.h"
+/*
+ * Convert a nibble encoded BAM sequence to a string of bases.
+ *
+ * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be
+ * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can
+ * be converted to the IUPAC characters.
+ * It falls back on the nibble2base_default function for the remainder.
+ */
+
+__attribute__((target("ssse3")))
+static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
+    seq[0] = 0;
+    const char *seq_end_ptr = seq + len;
+    char *seq_cursor = seq;
+    uint8_t *nibble_cursor = nib;
+    const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
+    __m128i first_upper_shuffle = _mm_setr_epi8(
+        0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
+    __m128i first_lower_shuffle = _mm_setr_epi8(
+        -1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7);
+    __m128i second_upper_shuffle = _mm_setr_epi8(
+        8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1);
+    __m128i second_lower_shuffle = _mm_setr_epi8(
+        -1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15);
+    __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
+    /* Work on 16 encoded characters at the time resulting in 32 decoded characters
+       Examples are given for 8 encoded characters A until H to keep it readable.
+        Encoded stored as |AB|CD|EF|GH|
+        Shuffle into |AB|00|CD|00|EF|00|GH|00| and
+                     |00|AB|00|CD|00|EF|00|GH|
+        Shift upper to the right resulting into
+                     |0A|B0|0C|D0|0E|F0|0G|H0| and
+                     |00|AB|00|CD|00|EF|00|GH|
+        Merge with or resulting into (X stands for garbage)
+                     |0A|XB|0C|XD|0E|XF|0G|XH|
+        Bitwise and with 0b1111 leads to:
+                     |0A|0B|0C|0D|0E|0F|0G|0H|
+        We can use the resulting 4-bit integers as indexes for the shuffle of
+        the nucleotide lookup. */
+    while (seq_cursor < seq_vec_end_ptr) {
+        __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);
+
+        __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
+        __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
+        __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
+        __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
+        __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15));
+        __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
+        _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
+
+        __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
+        __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
+        __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
+        __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
+        __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15));
+        __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
+        _mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);
+
+        nibble_cursor += sizeof(__m128i);
+        seq_cursor += 2 * sizeof(__m128i);
+    }
+    nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
+}

From efefb5397bd249f7e52858ffc6328331ad3920a0 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 16 May 2024 18:10:51 +0100
Subject: [PATCH 33/76] Move nibble2base_resolve() infrastructure to simd.c

Also add copyright boilerplate to the new file. [Commit marked as
authored by RMD so git blame attributes the boilerplate and moved
code to Rob, as the latter is unchanged from the original code. -JM]
---
 sam_internal.h | 10 ----------
 simd.c         | 31 +++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/sam_internal.h b/sam_internal.h
index 00be2c215..32c3bff97 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -103,16 +103,6 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
     && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
     && HAVE_BUILTIN_CPU_SUPPORT_SSSE3
 #include "simd.c"
-
-static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
-
-__attribute__((constructor))
-static void nibble2base_resolve(void) {
-    if (__builtin_cpu_supports("ssse3")) {
-        nibble2base = nibble2base_ssse3;
-    }
-}
-
 #else
 static inline void nibble2base(uint8_t *nib, char *seq, int len) {
     nibble2base_default(nib, seq, len);
diff --git a/simd.c b/simd.c
index 65e64cd2d..b95d0a930 100644
--- a/simd.c
+++ b/simd.c
@@ -1,3 +1,25 @@
+/*  simd.c -- SIMD optimised versions of various internal functions.
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
 #include "immintrin.h"
 /*
  * Convert a nibble encoded BAM sequence to a string of bases.
@@ -62,3 +84,12 @@ static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
     }
     nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
 }
+
+static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
+
+__attribute__((constructor))
+static void nibble2base_resolve(void) {
+    if (__builtin_cpu_supports("ssse3")) {
+        nibble2base = nibble2base_ssse3;
+    }
+}

From 9b257376c75217e601287c56590f9b164535f8aa Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Fri, 17 May 2024 21:47:02 +1200
Subject: [PATCH 34/76] Complete the refactoring of nibble2base_ssse3() as an
 ordinary source file

Build simd.c as a source file, so there is one copy of nibble2base_ssse3()
rather than potentially one per translation unit that uses it.

When everything was static, the names needed no prefix. However now that
there is a non-static function pointer, it needs an htslib-specific
prefix to avoid polluting the namespace. (In libhts.so it will usually
be a non-visible symbol, but in libhts.a it needs a prefix.)
---
 Makefile       |  2 ++
 sam_internal.h | 12 +++++++++---
 simd.c         | 25 ++++++++++++++++++++++---
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 8e2cccdfc..f7b68431f 100644
--- a/Makefile
+++ b/Makefile
@@ -209,6 +209,7 @@ LIBHTS_OBJS = \
 	region.o \
 	sam.o \
 	sam_mods.o \
+	simd.o \
 	synced_bcf_reader.o \
 	vcf_sweep.o \
 	tbx.o \
@@ -458,6 +459,7 @@ hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c
 vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
 sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
 sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h)
+simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h)
 tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h)
 faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h)
 bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h)
diff --git a/sam_internal.h b/sam_internal.h
index 32c3bff97..505dc9979 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -102,12 +102,18 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
 #if HTS_BUILD_IS_X86_64 \
     && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
     && HAVE_BUILTIN_CPU_SUPPORT_SSSE3
-#include "simd.c"
-#else
+#define BUILDING_SIMD_NIBBLE2BASE
+#endif
+
 static inline void nibble2base(uint8_t *nib, char *seq, int len) {
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+    extern void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len);
+    htslib_nibble2base(nib, seq, len);
+#else
     nibble2base_default(nib, seq, len);
-}
 #endif
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/simd.c b/simd.c
index b95d0a930..539677762 100644
--- a/simd.c
+++ b/simd.c
@@ -20,7 +20,16 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.  */
 
+#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
+#include <config.h>
+
+#include "htslib/sam.h"
+#include "sam_internal.h"
+
 #include "immintrin.h"
+
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+
 /*
  * Convert a nibble encoded BAM sequence to a string of bases.
  *
@@ -31,7 +40,7 @@ DEALINGS IN THE SOFTWARE.  */
  */
 
 __attribute__((target("ssse3")))
-static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
+static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
     seq[0] = 0;
     const char *seq_end_ptr = seq + len;
     char *seq_cursor = seq;
@@ -85,11 +94,21 @@ static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
     nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
 }
 
-static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
+void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
 
 __attribute__((constructor))
 static void nibble2base_resolve(void) {
     if (__builtin_cpu_supports("ssse3")) {
-        nibble2base = nibble2base_ssse3;
+        htslib_nibble2base = nibble2base_ssse3;
     }
 }
+
+#endif // BUILDING_SIMD_NIBBLE2BASE
+
+// Potentially useful diagnostic, and prevents "empty translation unit" errors
+const char htslib_simd[] =
+    "SIMD functions present:"
+#ifdef BUILDING_SIMD_NIBBLE2BASE
+    " nibble2base"
+#endif
+    ".";

From 165a18a6c45072d91696c485740e853b0323fcdd Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Sun, 19 May 2024 10:09:23 +1200
Subject: [PATCH 35/76] Move __attribute__((target)) check to configure too

Configure checks for __builtin_cpu_supports("ssse3"), so the check
for __attribute__((target)) might as well be there too. This enables
this internal-only infrastructure to be removed from htslib/hts_defs.h,
which is primarily for defines needed in the public headers.

Add a configure check for __attribute__((constructor)) too, as we will
likely be using it on other SIMD-using platforms as well.
---
 Makefile          |  2 ++
 configure.ac      | 24 ++++++++++++++++++++++++
 htslib/hts_defs.h | 18 ------------------
 sam_internal.h    |  7 +++----
 4 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/Makefile b/Makefile
index f7b68431f..f7c898826 100644
--- a/Makefile
+++ b/Makefile
@@ -298,6 +298,8 @@ config.h:
 	    echo '#define HAVE_AVX512 1' >> $@ ; \
 	fi
 	echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@
+	echo '#define HAVE_ATTRIBUTE_CONSTRUCTOR 1' >> $@
+	echo '#define HAVE_ATTRIBUTE_TARGET 1' >> $@
 	echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@
 	echo '#endif' >> $@
 
diff --git a/configure.ac b/configure.ac
index 13d91d218..cc2b0023d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -173,6 +173,21 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
   AC_MSG_RESULT([no])
 ])
 
+dnl Check for function attribute used in conjunction with __builtin_cpu_supports
+AC_MSG_CHECKING([for __attribute__((target))])
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  __attribute__((target("ssse3")))
+  int zero(void) {
+    return 0;
+  }
+]], [[zero();]])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_ATTRIBUTE_TARGET], 1,
+            [Define if __attribute__((target(...))) is available.])
+], [
+  AC_MSG_RESULT([no])
+])
+
 ]) dnl End of AS_IF(hts_have_cpuid)
 
 dnl Avoid chicken-and-egg problem where pkg-config supplies the
@@ -316,6 +331,15 @@ AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic])
 # Darwin has a dubious fdatasync() symbol, but no declaration in <unistd.h>
 AC_CHECK_DECL([fdatasync(int)], [AC_CHECK_FUNCS(fdatasync)])
 
+AC_MSG_CHECKING([for __attribute__((constructor))])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+  static __attribute__((constructor)) void noop(void) {}
+]], [])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_ATTRIBUTE_CONSTRUCTOR], 1,
+            [Define if __attribute__((constructor)) is available.])
+], [AC_MSG_RESULT([no])])
+
 if test $enable_plugins != no; then
   AC_SEARCH_LIBS([dlsym], [dl], [],
     [MSG_ERROR([dlsym() not found
diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h
index 0c5f8957a..e714e8fda 100644
--- a/htslib/hts_defs.h
+++ b/htslib/hts_defs.h
@@ -34,10 +34,6 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute)
 #endif
 
-#ifdef __has_builtin
-#define HTS_COMPILER_HAS_BUILTIN(function) __has_builtin(function)
-#endif
-
 #elif defined __GNUC__
 #define HTS_GCC_AT_LEAST(major, minor) \
     (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
@@ -46,10 +42,6 @@ DEALINGS IN THE SOFTWARE.  */
 #ifndef HTS_COMPILER_HAS
 #define HTS_COMPILER_HAS(attribute) 0
 #endif
-#ifndef HTS_COMPILER_HAS_BUILTIN
-#define HTS_COMPILER_HAS_BUILTIN(function) 0
-#endif
-
 #ifndef HTS_GCC_AT_LEAST
 #define HTS_GCC_AT_LEAST(major, minor) 0
 #endif
@@ -126,16 +118,6 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_FORMAT(type, idx, first)
 #endif
 
-#define HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
- ((HTS_COMPILER_HAS(target) && HTS_COMPILER_HAS_BUILTIN(__builtin_cpu_supports)) \
- || HTS_GCC_AT_LEAST(4, 8))
-
-#if (defined(__x86_64__) || defined(_M_X64))
-#define HTS_BUILD_IS_X86_64 1
-#else
-#define HTS_BUILD_IS_X86_64 0
-#endif
-
 #if defined(_WIN32) || defined(__CYGWIN__)
 #if defined(HTS_BUILDING_LIBRARY)
 #define HTSLIB_EXPORT __declspec(dllexport)
diff --git a/sam_internal.h b/sam_internal.h
index 505dc9979..17a6b794e 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -25,7 +25,7 @@ DEALINGS IN THE SOFTWARE.  */
 
 #include <errno.h>
 #include <stdint.h>
-#include "htslib/hts_defs.h"
+
 #include "htslib/sam.h"
 
 #ifdef __cplusplus
@@ -99,9 +99,8 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
         seq[i] = seq_nt16_str[bam_seqi(nib, i)];
 }
 
-#if HTS_BUILD_IS_X86_64 \
-    && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
-    && HAVE_BUILTIN_CPU_SUPPORT_SSSE3
+#if defined HAVE_ATTRIBUTE_CONSTRUCTOR && \
+    defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3
 #define BUILDING_SIMD_NIBBLE2BASE
 #endif
 

From c13783110bf13e4a9a0fd6a44def9e770a8e3ef9 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Sun, 2 Jun 2024 23:39:21 +1200
Subject: [PATCH 36/76] Minor improvements to Intel implementation

Notably the adjustment to seq_vec_end_ptr stops it from reverting
to the scalar code prematurely: at present it drops back to scalar
when <=32 entries remain, rather than <=31.
---
 simd.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/simd.c b/simd.c
index 539677762..33a053ecb 100644
--- a/simd.c
+++ b/simd.c
@@ -26,7 +26,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/sam.h"
 #include "sam_internal.h"
 
-#include "immintrin.h"
+#include <immintrin.h>
 
 #ifdef BUILDING_SIMD_NIBBLE2BASE
 
@@ -41,11 +41,10 @@ DEALINGS IN THE SOFTWARE.  */
 
 __attribute__((target("ssse3")))
 static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
-    seq[0] = 0;
     const char *seq_end_ptr = seq + len;
     char *seq_cursor = seq;
     uint8_t *nibble_cursor = nib;
-    const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
+    const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1);
     __m128i first_upper_shuffle = _mm_setr_epi8(
         0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
     __m128i first_lower_shuffle = _mm_setr_epi8(

From e090ef65df75fd16724d866b4538a3ed5bd7d14d Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Sun, 23 Jun 2024 20:32:11 +1200
Subject: [PATCH 37/76] Add correctness and speed tests for nibble2base()
 routines

---
 .gitignore          |   1 +
 Makefile            |   6 ++
 configure.ac        |  10 +++
 test/test_nibbles.c | 164 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 181 insertions(+)
 create mode 100644 test/test_nibbles.c

diff --git a/.gitignore b/.gitignore
index 9b7e26f71..87496ade8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,6 +69,7 @@ shlib-exports-*.txt
 /test/test_kfunc
 /test/test_kstring
 /test/test_mod
+/test/test_nibbles
 /test/test-parse-reg
 /test/test_realn
 /test/test-regidx
diff --git a/Makefile b/Makefile
index f7c898826..17d1baf97 100644
--- a/Makefile
+++ b/Makefile
@@ -87,6 +87,7 @@ BUILT_TEST_PROGRAMS = \
 	test/test_kfunc \
 	test/test_kstring \
 	test/test_mod \
+	test/test_nibbles \
 	test/test_realn \
 	test/test-regidx \
 	test/test_str2int \
@@ -603,6 +604,7 @@ check test: all $(HTSCODECS_TEST_TARGETS)
 	test/test_expr
 	test/test_kfunc
 	test/test_kstring
+	test/test_nibbles -v
 	test/test_str2int
 	test/test_time_funcs
 	test/fieldarith test/fieldarith.sam
@@ -671,6 +673,9 @@ test/test_kstring: test/test_kstring.o libhts.a
 test/test_mod: test/test_mod.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread
 
+test/test_nibbles: test/test_nibbles.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ test/test_nibbles.o libhts.a $(LIBS) -lpthread
+
 test/test_realn: test/test_realn.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread
 
@@ -773,6 +778,7 @@ test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h)
 test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h)
 test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
 test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h)
+test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h)
 test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h)
 test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
 test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h)
diff --git a/configure.ac b/configure.ac
index cc2b0023d..19b48b5e3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -340,6 +340,16 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[
             [Define if __attribute__((constructor)) is available.])
 ], [AC_MSG_RESULT([no])])
 
+AC_MSG_CHECKING([for clock_gettime with CLOCK_PROCESS_CPUTIME_ID])
+AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <time.h>]], [[
+  struct timespec ts;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+]])], [
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_CLOCK_GETTIME_CPUTIME], 1,
+            [Define if clock_gettime exists and accepts CLOCK_PROCESS_CPUTIME_ID.])
+], [AC_MSG_RESULT([no])])
+
 if test $enable_plugins != no; then
   AC_SEARCH_LIBS([dlsym], [dl], [],
     [MSG_ERROR([dlsym() not found
diff --git a/test/test_nibbles.c b/test/test_nibbles.c
new file mode 100644
index 000000000..1ef3456ea
--- /dev/null
+++ b/test/test_nibbles.c
@@ -0,0 +1,164 @@
+/*  test/test_nibbles.c -- Test SIMD optimised function implementations.
+
+    Copyright (C) 2024 Centre for Population Genomics.
+
+    Author: John Marshall <jmarshall@hey.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+#include <time.h>
+#else
+#include <sys/time.h>
+#endif
+
+#include "../htslib/sam.h"
+#include "../sam_internal.h"
+
+long long gettime(void) {
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+    struct timespec ts;
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
+    return ts.tv_sec * 1000000000LL + ts.tv_nsec;
+#else
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec * 1000000LL + tv.tv_usec;
+#endif
+}
+
+char *fmttime(long long elapsed) {
+    static char buf[64];
+
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+    long long sec = elapsed / 1000000000;
+    long long nsec = elapsed % 1000000000;
+    sprintf(buf, "%lld.%09lld processor seconds", sec, nsec);
+#else
+    long long sec = elapsed / 1000000;
+    long long usec = elapsed % 1000000;
+    sprintf(buf, "%lld.%06lld wall-time seconds", sec, usec);
+#endif
+
+    return buf;
+}
+
+void nibble2base_single(uint8_t *nib, char *seq, int len) {
+    int i;
+    for (i = 0; i < len; i++)
+        seq[i] = seq_nt16_str[bam_seqi(nib, i)];
+}
+
+unsigned char nibble[5000];
+char buf[10000];
+
+int validate_nibble2base(void) {
+    char defbuf[500];
+    int i, start, len;
+    unsigned long long total = 0, failed = 0;
+
+    for (i = 0; i < sizeof nibble; i++)
+        nibble[i] = i % 256;
+
+    for (start = 0; start < 80; start++)
+        for (len = 0; len < 400; len++) {
+            memset(defbuf, '\0', sizeof defbuf);
+            nibble2base_single(&nibble[start], defbuf, len);
+
+            memset(buf, '\0', sizeof defbuf);
+            nibble2base(&nibble[start], buf, len);
+
+            total++;
+            if (strcmp(defbuf, buf) != 0) {
+                printf("%s expected\n%s FAIL\n\n", defbuf, buf);
+                failed++;
+            }
+        }
+
+    if (failed > 0) {
+        fprintf(stderr, "Failures: %llu (out of %llu tests)\n", failed, total);
+        return 1;
+    }
+
+    return 0;
+}
+
+int time_nibble2base(int length, unsigned long count) {
+    unsigned long i, total = 0;
+
+    for (i = 0; i < length; i++)
+        nibble[i] = i % 256;
+
+    printf("Timing %lu nibble2base iterations with read length %d...\n", count, length);
+    long long start = gettime();
+
+    for (i = 0; i < count; i++) {
+        nibble2base(nibble, buf, length);
+        total += buf[i % length];
+    }
+
+    long long stop = gettime();
+    printf("%s (summing to %lu)\n", fmttime(stop - start), total);
+    return 0;
+}
+
+int main(int argc, char **argv) {
+    int readlen = 5000;
+    unsigned long count = 1000000;
+    int status = 0;
+    int c;
+
+    if (argc == 1)
+        printf(
+"Usage: test_nibbles [-c NUM] [-r NUM] [-n|-v]...\n"
+"Options:\n"
+"  -c NUM  Specify number of iterations [%lu]\n"
+"  -n      Run nibble2base speed tests\n"
+"  -r NUM  Specify read length [%d]\n"
+"  -v      Run all validation tests\n"
+"", count, readlen);
+
+    while ((c = getopt(argc, argv, "c:nr:v")) >= 0)
+        switch (c) {
+        case 'c':
+            count = strtoul(optarg, NULL, 0);
+            break;
+
+        case 'n':
+            status += time_nibble2base(readlen, count);
+            break;
+
+        case 'r':
+            readlen = atoi(optarg);
+            break;
+
+        case 'v':
+            status += validate_nibble2base();
+            break;
+        }
+
+    return status;
+}

From 256451f8610facbc97abe2e5df89335a72d57b43 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Sun, 23 Jun 2024 22:05:59 +1200
Subject: [PATCH 38/76] Add ARM Neon nibble2base_neon() implementation

As ARM CPU capability instructions are privileged, we also need to
add cpu_supports_neon() and implement it to query for Neon/AdvSIMD
in various platform-dependent ways. (On 32-bit ARM, glibc helpfully
renamed the HWCAP_* macros to HWCAP_ARM_*, so accept both on Linux.)

32-bit ARM GCC erroneously does not define vst1q_u8_x2() (bug 71233,
fixed in v14.1 in this case), so avoid it on 32-bit ARM by writing
interleaved via vst2q_u8() instead.
---
 sam_internal.h |   3 +-
 simd.c         | 135 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 135 insertions(+), 3 deletions(-)

diff --git a/sam_internal.h b/sam_internal.h
index 17a6b794e..8f701f337 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -100,7 +100,8 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
 }
 
 #if defined HAVE_ATTRIBUTE_CONSTRUCTOR && \
-    defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3
+    ((defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3) || \
+     (defined __ARM_NEON))
 #define BUILDING_SIMD_NIBBLE2BASE
 #endif
 
diff --git a/simd.c b/simd.c
index 33a053ecb..ba3edea7b 100644
--- a/simd.c
+++ b/simd.c
@@ -23,13 +23,91 @@ DEALINGS IN THE SOFTWARE.  */
 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
 #include <config.h>
 
+// These must be defined before the first system include to ensure that legacy
+// BSD types needed by <sys/sysctl.h> remain defined when _XOPEN_SOURCE is set.
+#if defined __APPLE__
+#define _DARWIN_C_SOURCE
+#elif defined __NetBSD__
+#define _NETBSD_SOURCE
+#endif
+
 #include "htslib/sam.h"
 #include "sam_internal.h"
 
+#if defined __x86_64__
 #include <immintrin.h>
+#elif defined __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined __arm__ || defined __aarch64__
+
+#if defined __linux__ || defined __FreeBSD__
+#include <sys/auxv.h>
+#elif defined __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif defined __NetBSD__
+#include <stddef.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#ifdef __aarch64__
+#include <aarch64/armreg.h>
+#else
+#include <arm/armreg.h>
+#endif
+#elif defined _WIN32
+#include <processthreadsapi.h>
+#endif
+
+static inline int cpu_supports_neon(void) {
+#if defined __linux__ && defined __arm__ && defined HWCAP_NEON
+    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
+#elif defined __linux__ && defined __arm__ && defined HWCAP_ARM_NEON
+    return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0;
+#elif defined __linux__ && defined __aarch64__ && defined HWCAP_ASIMD
+    return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0;
+#elif defined __APPLE__ && defined __aarch64__
+    int32_t ctl;
+    size_t ctlsize = sizeof ctl;
+    if (sysctlbyname("hw.optional.AdvSIMD", &ctl, &ctlsize, NULL, 0) != 0) return 0;
+    if (ctlsize != sizeof ctl) return 0;
+    return ctl;
+#elif defined __FreeBSD__ && defined __arm__ && defined HWCAP_NEON
+    unsigned long cap;
+    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
+    return (cap & HWCAP_NEON) != 0;
+#elif defined __FreeBSD__ && defined __aarch64__ && defined HWCAP_ASIMD
+    unsigned long cap;
+    if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0;
+    return (cap & HWCAP_ASIMD) != 0;
+#elif defined __NetBSD__ && defined __arm__ && defined ARM_MVFR0_ASIMD_MASK
+    uint32_t buf[16];
+    size_t buflen = sizeof buf;
+    if (sysctlbyname("machdep.id_mvfr", buf, &buflen, NULL, 0) != 0) return 0;
+    if (buflen < sizeof(uint32_t)) return 0;
+    return (buf[0] & ARM_MVFR0_ASIMD_MASK) == 0x00000002;
+#elif defined __NetBSD__ && defined __aarch64__ && defined ID_AA64PFR0_EL1_ADVSIMD
+    struct aarch64_sysctl_cpu_id buf;
+    size_t buflen = sizeof buf;
+    if (sysctlbyname("machdep.cpu0.cpu_id", &buf, &buflen, NULL, 0) != 0) return 0;
+    if (buflen < offsetof(struct aarch64_sysctl_cpu_id, ac_aa64pfr0) + sizeof(uint64_t)) return 0;
+    return (buf.ac_aa64pfr0 & ID_AA64PFR0_EL1_ADVSIMD & 0x00e00000) == 0;
+#elif defined _WIN32
+    return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0;
+#else
+    return 0;
+#endif
+}
+
+#endif
 
 #ifdef BUILDING_SIMD_NIBBLE2BASE
 
+void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
+
+#if defined __x86_64__
+
 /*
  * Convert a nibble encoded BAM sequence to a string of bases.
  *
@@ -93,8 +171,6 @@ static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
     nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
 }
 
-void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;
-
 __attribute__((constructor))
 static void nibble2base_resolve(void) {
     if (__builtin_cpu_supports("ssse3")) {
@@ -102,6 +178,61 @@ static void nibble2base_resolve(void) {
     }
 }
 
+#elif defined __ARM_NEON
+
+static void nibble2base_neon(uint8_t *nib, char *seq0, int len) {
+    uint8x16_t low_nibbles_mask = vdupq_n_u8(0x0f);
+    uint8x16_t nuc_lookup_vec = vld1q_u8((const uint8_t *) seq_nt16_str);
+#ifndef __aarch64__
+    uint8x8x2_t nuc_lookup_vec2 = {{ vget_low_u8(nuc_lookup_vec), vget_high_u8(nuc_lookup_vec) }};
+#endif
+
+    uint8_t *seq = (uint8_t *) seq0;
+    int blocks;
+
+    for (blocks = len / 32; blocks > 0; --blocks) {
+        uint8x16_t encoded = vld1q_u8(nib);
+        nib += 16;
+
+        /* Translate the high and low nibbles to nucleotide letters separately,
+           then interleave them back together via vzipq for writing. */
+
+        uint8x16_t high_nibbles = vshrq_n_u8(encoded, 4);
+        uint8x16_t low_nibbles  = vandq_u8(encoded, low_nibbles_mask);
+
+#ifdef __aarch64__
+        uint8x16_t high_nucleotides = vqtbl1q_u8(nuc_lookup_vec, high_nibbles);
+        uint8x16_t low_nucleotides  = vqtbl1q_u8(nuc_lookup_vec, low_nibbles);
+#else
+        uint8x8_t high_low  = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(high_nibbles));
+        uint8x8_t high_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(high_nibbles));
+        uint8x16_t high_nucleotides = vcombine_u8(high_low, high_high);
+
+        uint8x8_t low_low  = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(low_nibbles));
+        uint8x8_t low_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(low_nibbles));
+        uint8x16_t low_nucleotides = vcombine_u8(low_low, low_high);
+#endif
+
+#ifdef __aarch64__
+        vst1q_u8_x2(seq, vzipq_u8(high_nucleotides, low_nucleotides));
+#else
+        // Avoid vst1q_u8_x2 as GCC erroneously omits it on 32-bit ARM
+        uint8x16x2_t nucleotides = {{ high_nucleotides, low_nucleotides }};
+        vst2q_u8(seq, nucleotides);
+#endif
+        seq += 32;
+    }
+
+    if (len % 32 != 0)
+        nibble2base_default(nib, (char *) seq, len % 32);
+}
+
+static __attribute__((constructor)) void nibble2base_resolve(void) {
+    if (cpu_supports_neon()) htslib_nibble2base = nibble2base_neon;
+}
+
+#endif
+
 #endif // BUILDING_SIMD_NIBBLE2BASE
 
 // Potentially useful diagnostic, and prevents "empty translation unit" errors

From 1b4cda656cd8e96420144f5dbe2119d2a37a6265 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Mon, 1 Jul 2024 23:54:09 +1200
Subject: [PATCH 39/76] Define HAVE_ATTRIBUTE_CONSTRUCTOR on ARM too for
 non-configure builds

---
 Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 17d1baf97..75b570a34 100644
--- a/Makefile
+++ b/Makefile
@@ -298,8 +298,10 @@ config.h:
 	if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \
 	    echo '#define HAVE_AVX512 1' >> $@ ; \
 	fi
-	echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@
+	echo '#if defined __x86_64__ || defined __arm__ || defined __aarch64__' >> $@
 	echo '#define HAVE_ATTRIBUTE_CONSTRUCTOR 1' >> $@
+	echo '#endif' >> $@
+	echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@
 	echo '#define HAVE_ATTRIBUTE_TARGET 1' >> $@
 	echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@
 	echo '#endif' >> $@

From 624e95b874661d93cdd572ef83c7cfdf58351b88 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 24 Jun 2024 11:50:11 +0100
Subject: [PATCH 40/76] Add Windows CI via github actions

Also amend .gitattributes file for more Windows text-mode removal as
for some reason the GitHub Actions git is much more likely to use
CR-LF line endings.

Added libdeflate to the Windows build.

Corrected test/header_syms.pl to cope with Windows line endings.
---
 .gitattributes                      | 11 ++++++++
 .github/workflows/windows-build.yml | 40 +++++++++++++++++++++++++++++
 test/header_syms.pl                 |  1 +
 3 files changed, 52 insertions(+)
 create mode 100644 .github/workflows/windows-build.yml

diff --git a/.gitattributes b/.gitattributes
index 5d9850bc7..2d5a80e04 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -24,3 +24,14 @@ test/index_dos.sam -text
 # Remove the text attribute from various faidx test files
 test/faidx/faidx*.fa* -text
 test/faidx/fastqs*.fq* -text
+test/fastq/*.fa -text
+test/fastq/*.fq -text
+*.tst -text
+*.out -text
+*.crai    -text
+*.bai     -text
+*.csi     -text
+*.gzi     -text
+*.bcf     -text
+*.sam     -text
+*.sam.gz  -text
diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml
new file mode 100644
index 000000000..c9c18461b
--- /dev/null
+++ b/.github/workflows/windows-build.yml
@@ -0,0 +1,40 @@
+name: Windows/MinGW-W64 CI
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: windows-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: true
+    - name: Set up MSYS2 MinGW-W64
+      uses: msys2/setup-msys2@v2
+      with:
+        msystem: mingw64
+        update: false
+        install: >-
+          git
+          zlib-devel
+          libbz2-devel
+          liblzma-devel
+          mingw-w64-x86_64-toolchain
+          mingw-w64-x86_64-autotools
+          mingw-w64-x86_64-tools-git
+          mingw-w64-x86_64-libdeflate
+    - name: Compile htslib
+      shell: msys2 {0}
+      run: |
+        export PATH=/mingw64/bin:$PATH
+        export MSYSTEM=MINGW64
+        autoreconf -i
+        ./configure
+        make -j6
+    - name: Check Htslib
+      shell: msys2 {0}
+      run: |
+        export PATH=/mingw64/bin:$PATH
+        export MSYSTEM=MINGW64
+        make test-shlib-exports && make check
+
diff --git a/test/header_syms.pl b/test/header_syms.pl
index fe5128a78..a8d4a885c 100755
--- a/test/header_syms.pl
+++ b/test/header_syms.pl
@@ -60,6 +60,7 @@ sub extract_symbols {
 
     open(my $f, '<', $file) || die "Couldn't open $file : $!\n";
     my $text = <$f>;
+    $text =~ tr/\r//d;
     close($f) || die "Error reading $file : $!\n";
 
     # Get rid of comments

From b8145e606af125c7e17b5c9a8803a036969b1bf7 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Mon, 24 Jun 2024 17:19:46 +0100
Subject: [PATCH 41/76] Update htscodecs to include github actions branch

---
 htscodecs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htscodecs b/htscodecs
index 5a2627ec4..16548914a 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit 5a2627ec4d6bf0c96b0d7e85a0b031e3ce80f8c1
+Subproject commit 16548914ada64cf77acd7c64562b085ed1a4ccd9

From f3d401cc6cf5348c7f1dc8ebb9c1ce4d5001fd46 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 27 Jun 2024 13:38:14 +0100
Subject: [PATCH 42/76] Speed up faidx.

- bgzf_getc is slow as it's a heavy function and not inlined.  Most of
  the time though it's just an array fetch, so inline the basic form and
  revert to the function call for the complex form.

- isgraph and all other ctype functions are slow.  We assume ASCII and
  just do a naive implementation.

The speed benefits are (seconds):

                      Old     New
    Index GRCh38      13.4    8.4
    Query chr1         1.7    0.9

Given a significant speed change for a small localised modification, it
seems worthwhile having.
---
 faidx.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/faidx.c b/faidx.c
index ce8fe5d9f..a45bdb125 100644
--- a/faidx.c
+++ b/faidx.c
@@ -43,6 +43,29 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/kstring.h"
 #include "hts_internal.h"
 
+// Faster isgraph; assumes ASCII
+static inline int isgraph_(unsigned char c) {
+    return c > ' ' && c <= '~';
+}
+
+#ifdef isgraph
+#  undef isgraph
+#endif
+#define isgraph isgraph_
+
+// An optimised bgzf_getc.
+// We could consider moving this to bgzf.h, but our own code uses it here only.
+static inline int bgzf_getc_(BGZF *fp) {
+    if (fp->block_offset+1 < fp->block_length) {
+        int c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++];
+        fp->uncompressed_address++;
+        return c;
+    }
+
+    return bgzf_getc(fp);
+}
+#define bgzf_getc bgzf_getc_
+
 typedef struct {
     int id; // faidx_t->name[id] is for this struct.
     uint32_t line_len, line_blen;
@@ -727,7 +750,8 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
         return NULL;
     }
 
-    while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 )
+    BGZF *fp = fai->bgzf;
+    while ( l < end - beg && (c=bgzf_getc(fp))>=0 )
         if (isgraph(c)) s[l++] = c;
     if (c < 0) {
         hts_log_error("Failed to retrieve block: %s",

From 6012472cc0c937481fc0c36f9b8bdb8da8c4dbfd Mon Sep 17 00:00:00 2001
From: Ruben Vorderman <r.h.p.vorderman@lumc.nl>
Date: Mon, 8 Jul 2024 07:51:30 +0200
Subject: [PATCH 43/76] Simplify SSSE3 nibble2base function

---
 simd.c | 54 ++++++++++++++++--------------------------------------
 1 file changed, 16 insertions(+), 38 deletions(-)

diff --git a/simd.c b/simd.c
index ba3edea7b..865dd887e 100644
--- a/simd.c
+++ b/simd.c
@@ -123,48 +123,26 @@ static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
     char *seq_cursor = seq;
     uint8_t *nibble_cursor = nib;
     const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1);
-    __m128i first_upper_shuffle = _mm_setr_epi8(
-        0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
-    __m128i first_lower_shuffle = _mm_setr_epi8(
-        -1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7);
-    __m128i second_upper_shuffle = _mm_setr_epi8(
-        8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1);
-    __m128i second_lower_shuffle = _mm_setr_epi8(
-        -1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15);
     __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
-    /* Work on 16 encoded characters at the time resulting in 32 decoded characters
-       Examples are given for 8 encoded characters A until H to keep it readable.
-        Encoded stored as |AB|CD|EF|GH|
-        Shuffle into |AB|00|CD|00|EF|00|GH|00| and
-                     |00|AB|00|CD|00|EF|00|GH|
-        Shift upper to the right resulting into
-                     |0A|B0|0C|D0|0E|F0|0G|H0| and
-                     |00|AB|00|CD|00|EF|00|GH|
-        Merge with or resulting into (X stands for garbage)
-                     |0A|XB|0C|XD|0E|XF|0G|XH|
-        Bitwise and with 0b1111 leads to:
-                     |0A|0B|0C|0D|0E|0F|0G|0H|
-        We can use the resulting 4-bit integers as indexes for the shuffle of
-        the nucleotide lookup. */
+    /* Nucleotides are encoded 4-bits per nucleotide and stored in 8-bit bytes
+       as follows: |AB|CD|EF|GH|. The 4-bit codes (going from 0-15) can be used
+       together with the pshufb instruction as a lookup table. The most efficient
+       way is to use bitwise AND and shift to create two vectors. One with all
+       the upper codes (|A|C|E|G|) and one with the lower codes (|B|D|F|H|).
+       The lookup can then be performed and the resulting vectors can be
+       interleaved again using the unpack instructions. */
     while (seq_cursor < seq_vec_end_ptr) {
         __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);
-
-        __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
-        __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
-        __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
-        __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
-        __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15));
-        __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
+        __m128i encoded_upper = _mm_srli_epi64(encoded, 4);
+        encoded_upper = _mm_and_si128(encoded_upper, _mm_set1_epi8(15));
+        __m128i encoded_lower = _mm_and_si128(encoded, _mm_set1_epi8(15));
+        __m128i nucs_upper = _mm_shuffle_epi8(nuc_lookup_vec, encoded_upper);
+        __m128i nucs_lower = _mm_shuffle_epi8(nuc_lookup_vec, encoded_lower);
+        __m128i first_nucleotides = _mm_unpacklo_epi8(nucs_upper, nucs_lower);
+        __m128i second_nucleotides = _mm_unpackhi_epi8(nucs_upper, nucs_lower);
         _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);
-
-        __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
-        __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
-        __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
-        __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
-        __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15));
-        __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
-        _mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);
-
+        _mm_storeu_si128((__m128i *)(seq_cursor + sizeof(__m128i)),
+                         second_nucleotides);
         nibble_cursor += sizeof(__m128i);
         seq_cursor += 2 * sizeof(__m128i);
     }

From 41ea68b38d3bc6bc3f42b8d598c80fb52af3b8cc Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Fri, 28 Jun 2024 22:18:45 +1200
Subject: [PATCH 44/76] Read whole lines at once in fai_retrieve()

Because fai_retrieve() is given only well-formatted input containing
lines of the same length, it already knows exactly where the base and
non-graphic characters are. So in general the interval to be read will
look like

    ......ATGCAT    (read last six bases and line terminator)
    ATGCATGCATGC    (read complete line including line terminator)
    ATGCATGCATGC    (read complete line including line terminator)
    ATGC........    (read first four base characters)

and can be read a line at a time instead of a character at a time,
with special handling for the partial first and last lines, and
discarding the terminator characters at the end of each line read.
---
 faidx.c | 65 +++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 18 deletions(-)

diff --git a/faidx.c b/faidx.c
index a45bdb125..2e8968304 100644
--- a/faidx.c
+++ b/faidx.c
@@ -715,9 +715,8 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
 
 static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
                           uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
-    char *s;
-    size_t l;
-    int c = 0;
+    char *buffer, *s;
+    ssize_t nread, remaining, firstline_len, firstline_blen;
     int ret;
 
     if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
@@ -743,27 +742,57 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
         return NULL;
     }
 
-    l = 0;
-    s = (char*)malloc((size_t) end - beg + 2);
-    if (!s) {
+    // Over-allocate so there is extra space for one end-of-line sequence
+    buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1);
+    if (!buffer) {
         *len = -1;
         return NULL;
     }
 
-    BGZF *fp = fai->bgzf;
-    while ( l < end - beg && (c=bgzf_getc(fp))>=0 )
-        if (isgraph(c)) s[l++] = c;
-    if (c < 0) {
-        hts_log_error("Failed to retrieve block: %s",
-            c == -1 ? "unexpected end of file" : "error reading file");
-        free(s);
-        *len = -1;
-        return NULL;
+    remaining = *len = end - beg;
+    firstline_blen = val->line_blen - beg % val->line_blen;
+
+    // Special case when the entire interval requested is within a single FASTA/Q line
+    if (remaining <= firstline_blen) {
+        nread = bgzf_read_small(fai->bgzf, buffer, remaining);
+        if (nread < remaining) goto error;
+        buffer[nread] = '\0';
+        return buffer;
+    }
+
+    s = buffer;
+    firstline_len = val->line_len - beg % val->line_blen;
+
+    // Read the (partial) first line and its line terminator, but increment  s  past the
+    // line contents only, so the terminator characters will be overwritten by the next line.
+    nread = bgzf_read_small(fai->bgzf, s, firstline_len);
+    if (nread < firstline_len) goto error;
+    s += firstline_blen;
+    remaining -= firstline_blen;
+
+    // Similarly read complete lines and their line terminator characters, but overwrite the latter.
+    while (remaining > val->line_blen) {
+        nread = bgzf_read_small(fai->bgzf, s, val->line_len);
+        if (nread < (ssize_t) val->line_len) goto error;
+        s += val->line_blen;
+        remaining -= val->line_blen;
     }
 
-    s[l] = '\0';
-    *len = l;
-    return s;
+    if (remaining > 0) {
+        nread = bgzf_read_small(fai->bgzf, s, remaining);
+        if (nread < remaining) goto error;
+        s += remaining;
+    }
+
+    *s = '\0';
+    return buffer;
+
+error:
+    hts_log_error("Failed to retrieve block: %s",
+                  (nread == 0)? "unexpected end of file" : "error reading file");
+    free(buffer);
+    *len = -1;
+    return NULL;
 }
 
 static int fai_get_val(const faidx_t *fai, const char *str,

From 1187fa832998dd5fea9ea2a78bf6863a31c508f9 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 8 Jul 2024 16:18:35 +0100
Subject: [PATCH 45/76] Remove AppVeyor configuration

Completes the migration of Windows testing to GitHub actions.
GitHub actions tests were added in commit 624e95b8 (PR #1796).
---
 .appveyor.yml | 43 -------------------------------------------
 1 file changed, 43 deletions(-)
 delete mode 100644 .appveyor.yml

diff --git a/.appveyor.yml b/.appveyor.yml
deleted file mode 100644
index 8fe288094..000000000
--- a/.appveyor.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-# version format.
-# you can use {branch} name in version format too
-# version: 1.0.{build}-{branch}
-version: 'vers.{build}'
-
-# branches to build
-branches:
-    # Blacklist
-    except:
-      - gh-pages
-
-# Do not build on tags (GitHub and BitBucket)
-skip_tags: true
-
-# Skipping commits affecting specific files (GitHub only). More details here: /docs/appveyor-yml
-#skip_commits:
-#  files:
-#    - docs/*
-#    - '**/*.html'
-
-# Appveyor Windows images are based on Visual studio version
-image: Visual Studio 2019
-
-# We use Mingw/Msys, so use pacman for installs
-install:
-  - set HOME=.
-  - set MSYSTEM=MINGW64
-  - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
-  - set MINGWPREFIX=x86_64-w64-mingw32
-  - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-tools-git\""
-
-build_script:
-  - set HOME=.
-  - set MSYSTEM=MINGW64
-  - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH%
-  - git submodule update --init --recursive
-  - "sh -lc \"autoreconf -i && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\""
-
-#build_script:
-#  - make
-
-test_script:
-  - "sh -lc \"make test-shlib-exports && make test\""

From 19a27e959e3110ca32d7c4f3106775ccf2cf11cc Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 9 Jul 2024 09:45:31 +0100
Subject: [PATCH 46/76] Speed up kputll.

The kputuw function is considerably faster as it encodes 2 digits at a
time and also utilises __builtin_clz.  This changes kputll to use the
same 2 digits at a time trick.  I have a __builtin_clzll variant too,
but with longer numbers it's not the main bottleneck and we fall back
to kputuw for small numbers.  This avoids complicating the code with
builtin checks and alternate versions.

An alternative, purely for sam_format1_append would be something like:

    static inline int kputll_fast(long long c, kstring_t *s) {
        return c <= INT_MAX && c >= INT_MIN ? kputw(c, s) : kputll(c, s);
    }
    #define kputll kputll_fast

This works as BAM/CRAM only support 32-bit numbers for POS, PNEXT and
TLEN anyway, so ll vs w is an irrelevant distinction.  However I chose
to modify the header file so it fixes other callers.

Overall compressed BAM to uncompressed SAM conversion is about 5%
quicker (tested on 10 million short-read seqs; it'll be minimal on
long seqs).  This includes decode time and other functions too.  The
sam_format1_append-only component of that is about 15-25% quicker
depending on compiler and version.
---
 htslib/kstring.h    | 68 +++++++++++++++++++++++++++++++------
 test/test_kstring.c | 81 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+), 11 deletions(-)

diff --git a/htslib/kstring.h b/htslib/kstring.h
index 53a19806d..0a3efb7d2 100644
--- a/htslib/kstring.h
+++ b/htslib/kstring.h
@@ -375,17 +375,63 @@ static inline int kputw(int c, kstring_t *s)
 
 static inline int kputll(long long c, kstring_t *s)
 {
-	char buf[32];
-	int i, l = 0;
-	unsigned long long x = c;
-	if (c < 0) x = -x;
-	do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
-	if (c < 0) buf[l++] = '-';
-	if (ks_resize(s, s->l + l + 2) < 0)
-		return EOF;
-	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
-	s->s[s->l] = 0;
-	return 0;
+    // Worst case expansion.  One check reduces function size
+    // and aids inlining chance.  Memory overhead is minimal.
+    if (ks_resize(s, s->l + 23) < 0)
+	return EOF;
+
+    unsigned long long x = c;
+    if (c < 0) {
+	x = -x;
+        s->s[s->l++] = '-';
+    }
+
+    if (x <= UINT32_MAX)
+	return kputuw(x, s);
+
+    static const char kputull_dig2r[] =
+        "00010203040506070809"
+        "10111213141516171819"
+        "20212223242526272829"
+        "30313233343536373839"
+        "40414243444546474849"
+        "50515253545556575859"
+        "60616263646566676869"
+        "70717273747576777879"
+        "80818283848586878889"
+        "90919293949596979899";
+    unsigned int l, j;
+    char *cp;
+
+    // Find out how long the number is (could consider clzll)
+    uint64_t m = 1;
+    l = 0;
+    if (sizeof(long long)==sizeof(uint64_t) && x >= 10000000000000000000ULL) {
+	// avoids overflow below
+	l = 20;
+    } else {
+	do {
+	    l++;
+	    m *= 10;
+	} while (x >= m);
+    }
+
+    // Add digits two at a time
+    j = l;
+    cp = s->s + s->l;
+    while (x >= 10) {
+        const char *d = &kputull_dig2r[2*(x%100)];
+        x /= 100;
+        memcpy(&cp[j-=2], d, 2);
+    }
+
+    // Last one (if necessary).  We know that x < 10 by now.
+    if (j == 1)
+        cp[0] = x + '0';
+
+    s->l += l;
+    s->s[s->l] = 0;
+    return 0;
 }
 
 static inline int kputl(long c, kstring_t *s) {
diff --git a/test/test_kstring.c b/test/test_kstring.c
index feb8243df..f942656f1 100644
--- a/test/test_kstring.c
+++ b/test/test_kstring.c
@@ -261,6 +261,84 @@ static int test_kputw(int64_t start, int64_t end) {
     return 0;
 }
 
+static int test_kputll_from_to(kstring_t *str, long long s, long long e) {
+    long long i = s;
+
+    for (;;) {
+        str->l = 0;
+        memset(str->s, 0xff, str->m);
+        if (kputll(i, str) < 0 || !str->s) {
+            perror("kputll");
+            return -1;
+        }
+        if (str->l >= str->m || str->s[str->l] != '\0') {
+            fprintf(stderr, "No NUL termination on string from kputll\n");
+            return -1;
+        }
+        if (i != strtoll(str->s, NULL, 10)) {
+            fprintf(stderr,
+                    "kputll wrote the wrong value, expected %lld, got %s\n",
+                    i, str->s);
+            return -1;
+        }
+        if (i >= e) break;
+        i++;
+    }
+    return 0;
+}
+
+static int test_kputll(long long start, long long end) {
+    kstring_t str = { 0, 0, NULL };
+    unsigned long long val;
+
+    str.s = malloc(2);
+    if (!str.s) {
+        perror("malloc");
+        return -1;
+    }
+    str.m = 2;
+
+    for (val = 1; val < INT64_MAX-5; val *= 10) {
+        if (test_kputll_from_to(&str, val >= 5 ? val - 5 : val, val) < 0) {
+            free(ks_release(&str));
+            return -1;
+        }
+    }
+
+    for (val = 1; val < INT64_MAX-5; val *= 10) {
+        long long valm = -val;
+        if (test_kputll_from_to(&str, valm >= 5 ? valm - 5 : valm, valm) < 0) {
+            free(ks_release(&str));
+            return -1;
+        }
+    }
+
+    if (test_kputll_from_to(&str, INT64_MAX - 5, INT64_MAX) < 0) {
+        free(ks_release(&str));
+        return -1;
+    }
+
+    if (test_kputll_from_to(&str, INT64_MIN, INT64_MIN + 5) < 0) {
+        free(ks_release(&str));
+        return -1;
+    }
+
+    str.m = 1; // Force a resize
+    int64_t start2 = (int64_t)start; // no larger on our platforms
+    int64_t end2   = (int64_t)end;
+    clamp(&start2, INT64_MIN, INT64_MAX);
+    clamp(&end2,   INT64_MIN, INT64_MAX);
+
+    if (test_kputll_from_to(&str, start, end) < 0) {
+        free(ks_release(&str));
+        return -1;
+    }
+
+    free(ks_release(&str));
+
+    return 0;
+}
+
 // callback used by test_kgetline
 static char *mock_fgets(char *str, int num, void *p) {
     int *mock_state = (int*)p;
@@ -413,6 +491,9 @@ int main(int argc, char **argv) {
     if (!test || strcmp(test, "kputw") == 0)
         if (test_kputw(start, end) != 0) res = EXIT_FAILURE;
 
+    if (!test || strcmp(test, "kputll") == 0)
+        if (test_kputll(start, end) != 0) res = EXIT_FAILURE;
+
     if (!test || strcmp(test, "kgetline") == 0)
         if (test_kgetline() != 0) res = EXIT_FAILURE;
 

From db2b449a1284b65cb977a73ed6ee09363a8c5870 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Thu, 18 Jul 2024 02:25:25 +1200
Subject: [PATCH 47/76] Miscellaneous minor build infrastructure and code fixes
 (PR #1807)

* Ignore and clean test/*/FAIL* for six subdirectories

These files can appear in base_mods, fastq, mpileup, and sam_filter
as well as faidx and tabix.

* Fix comment header to use `@CO\t` as per other comment headers

* Remove extraneous inclusion and add missing dependency

* Remove last traces of previously deleted bgzf_idx_amend_last()

As noted in #1722, this function was removed in PR #1672.

* Use isspace_c() et al in annot-tsv.c

* Minor corrections to system headers

Plain getopt() is declared in <unistd.h>; strcasecmp() et al are only
portably declared in <strings.h>.
---
 .gitignore                     |  3 +--
 Makefile                       | 10 +++++-----
 annot-tsv.c                    |  7 ++++---
 cram/cram_external.c           |  1 -
 hts_internal.h                 | 12 ------------
 samples/mod_aux.c              |  1 +
 samples/mod_bam.c              |  1 +
 test/base_mods/MM-explicit.sam |  2 +-
 test/test_faidx.c              |  2 +-
 9 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/.gitignore b/.gitignore
index 87496ade8..99abbd0f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,9 +45,9 @@ shlib-exports-*.txt
 /bgzip
 /htsfile
 /tabix
+/test/*/FAIL*
 /test/bgzf_boundaries/*.tmp.*
 /test/faidx/*.tmp*
-/test/faidx/FAIL*
 /test/fieldarith
 /test/hfile
 /test/hts_endian
@@ -57,7 +57,6 @@ shlib-exports-*.txt
 /test/plugins-dlhts
 /test/sam
 /test/tabix/*.tmp.*
-/test/tabix/FAIL*
 /test/test-bcf-sr
 /test/test-bcf-translate
 /test/test-bcf_set_variant_type
diff --git a/Makefile b/Makefile
index 75b570a34..8af7558e2 100644
--- a/Makefile
+++ b/Makefile
@@ -526,10 +526,10 @@ htsfile: htsfile.o libhts.a
 tabix: tabix.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread
 
-annot-tsv.o: annot-tsv.c config.h $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h)
+annot-tsv.o: annot-tsv.c config.h $(htslib_hts_h) $(htslib_hts_defs_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_regidx_h) $(textutils_internal_h)
 bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h)
 htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h)
-tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h)
+tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_thread_pool_h)
 
 # Runes to check that the htscodecs submodule is present
 ifdef HTSCODECS_SOURCES
@@ -924,9 +924,9 @@ htslib-uninstalled.pc: htslib.pc.tmp
 
 
 testclean:
-	-rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \
-               test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \
-               test/bgzf_boundaries/*.tmp.* \
+	-rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* \
+               test/longrefs/*.tmp.* test/tabix/*.tmp.* \
+               test/bgzf_boundaries/*.tmp.* test/*/FAIL* \
                header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt
 	-rm -rf htscodecs/tests/test.out
 
diff --git a/annot-tsv.c b/annot-tsv.c
index cbbf165e3..e453ede5b 100644
--- a/annot-tsv.c
+++ b/annot-tsv.c
@@ -44,6 +44,7 @@
 #include "htslib/kseq.h"
 #include "htslib/bgzf.h"
 #include "htslib/regidx.h"
+#include "textutils_internal.h"
 
 #define ANN_NBP     1
 #define ANN_FRAC    2
@@ -409,15 +410,15 @@ void parse_header(dat_t *dat, char *fname, int nth_row, int autodetect)
     for (i=0; i<cols->n; i++)
     {
         char *ss = cols->off[i];
-        while ( *ss && (*ss=='#' || isspace(*ss)) ) ss++;
+        while ( *ss && (*ss=='#' || isspace_c(*ss)) ) ss++;
         if ( !*ss ) error("Could not parse the header field \"%s\": %s\n", cols->off[i],dat->line.s);
         if ( *ss=='[' )
         {
             char *se = ss+1;
-            while ( *se && isdigit(*se) ) se++;
+            while ( *se && isdigit_c(*se) ) se++;
             if ( *se==']' ) ss = se + 1;
         }
-        while ( *ss && (*ss=='#' || isspace(*ss)) ) ss++;
+        while ( *ss && (*ss=='#' || isspace_c(*ss)) ) ss++;
         if ( !*ss ) error("Could not parse the header field \"%s\": %s\n", cols->off[i],dat->line.s);
         cols->off[i] = ss;
         khash_str2int_set(dat->hdr.name2idx, cols->off[i], i);
diff --git a/cram/cram_external.c b/cram/cram_external.c
index 1102e8daa..c6d7d66af 100644
--- a/cram/cram_external.c
+++ b/cram/cram_external.c
@@ -49,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #include "../htslib/hfile.h"
-#include "../hfile_internal.h"
 #include "cram.h"
 
 /*
diff --git a/hts_internal.h b/hts_internal.h
index 70abbb29c..52f29e6c1 100644
--- a/hts_internal.h
+++ b/hts_internal.h
@@ -123,18 +123,6 @@ const char *hts_plugin_path(void);
  */
 int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped);
 
-/*
- * bgzf analogue to hts_idx_amend_last.
- *
- * This is needed when multi-threading and writing indices on the fly.
- * At the point of writing a record we know the virtual offset for start
- * and end, but that end virtual offset may be the end of the current
- * block.  In standard indexing our end virtual offset becomes the start
- * of the next block.  Thus to ensure bit for bit compatibility we
- * detect this boundary case and fix it up here.
- */
-void bgzf_idx_amend_last(BGZF *fp, hts_idx_t *hidx, uint64_t offset);
-
 static inline int find_file_extension(const char *fn, char ext_out[static HTS_MAX_EXT_LEN])
 {
     const char *delim = fn ? strstr(fn, HTS_IDX_DELIM) : NULL, *ext;
diff --git a/samples/mod_aux.c b/samples/mod_aux.c
index d5ed18cde..b6e75fb0b 100644
--- a/samples/mod_aux.c
+++ b/samples/mod_aux.c
@@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE
 /* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
 
 #include <getopt.h>
+#include <strings.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
diff --git a/samples/mod_bam.c b/samples/mod_bam.c
index 9f1eb324e..6c56e62d1 100644
--- a/samples/mod_bam.c
+++ b/samples/mod_bam.c
@@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE
 /* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
 
 #include <getopt.h>
+#include <strings.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam
index e85afa293..c230a9d82 100644
--- a/test/base_mods/MM-explicit.sam
+++ b/test/base_mods/MM-explicit.sam
@@ -19,7 +19,7 @@
 @CO	ATCATCATTCCTACCGCTATAGCCT  r3; mixture
 @CO	  -  -   .   -. -     --
 @CO	         M    M
-@CO       -  -   ??  ?? ?     --
+@CO	  -  -   ??  ?? ?     --
 @CO	         hH  hh h     --
 @CO	
 r1	0	*	0	0	*	*	0	0	ATCATCATTCCTACCGCTATAGCCT	*	Mm:Z:C+mh,2,0,1;	Ml:B:C,200,10,50,170,160,20
diff --git a/test/test_faidx.c b/test/test_faidx.c
index 566149071..f73f973a0 100644
--- a/test/test_faidx.c
+++ b/test/test_faidx.c
@@ -26,7 +26,7 @@ DEALINGS IN THE SOFTWARE.  */
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <getopt.h>
+#include <unistd.h>
 
 #include "../htslib/faidx.h"
 

From a135bc041dba09743abbeda66d3ec5dbd0a61cae Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 20 Jun 2024 10:10:51 +0100
Subject: [PATCH 48/76] Use FNV1a for string hashing

The existing X31 hash propagates bits fairly slowly, resulting in
a poor distribution of keys if most of the differences in strings
are at the end.  Fix by using FNV1a instead, which is a similar
speed to calculate but distributes keys much more effectively.

Includes kh_stats() function in khash which produces a histogram
of probe chain lengths and a khash test framework.  The test
program can also be used to benchmark insertion and lookup
times.
---
 .gitignore        |   1 +
 Makefile          |   6 +
 htslib/khash.h    |  87 +++++++-
 test/test_khash.c | 502 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 592 insertions(+), 4 deletions(-)
 create mode 100644 test/test_khash.c

diff --git a/.gitignore b/.gitignore
index 99abbd0f6..817b123d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,7 @@ shlib-exports-*.txt
 /test/test_index
 /test/test_introspection
 /test/test_kfunc
+/test/test_khash
 /test/test_kstring
 /test/test_mod
 /test/test_nibbles
diff --git a/Makefile b/Makefile
index 8af7558e2..8cea55554 100644
--- a/Makefile
+++ b/Makefile
@@ -85,6 +85,7 @@ BUILT_TEST_PROGRAMS = \
 	test/test_expr \
 	test/test_faidx \
 	test/test_kfunc \
+	test/test_khash \
 	test/test_kstring \
 	test/test_mod \
 	test/test_nibbles \
@@ -605,6 +606,7 @@ check test: all $(HTSCODECS_TEST_TARGETS)
 	test/hts_endian
 	test/test_expr
 	test/test_kfunc
+	test/test_khash
 	test/test_kstring
 	test/test_nibbles -v
 	test/test_str2int
@@ -669,6 +671,9 @@ test/test_faidx: test/test_faidx.o libhts.a
 test/test_kfunc: test/test_kfunc.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread
 
+test/test_khash: test/test_khash.o libhts.a
+	$(CC) $(LDFLAGS) -o $@ test/test_khash.o libhts.a -lz $(LIBS) -lpthread
+
 test/test_kstring: test/test_kstring.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread
 
@@ -778,6 +783,7 @@ test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_fa
 test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h)
 test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h)
 test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h)
+test/test_khash.o: test/test_khash.c config.h $(htslib_khash_h) $(htslib_kroundup_h)
 test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
 test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h)
 test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h)
diff --git a/htslib/khash.h b/htslib/khash.h
index 4cea91020..02e4917c8 100644
--- a/htslib/khash.h
+++ b/htslib/khash.h
@@ -1,7 +1,7 @@
 /* The MIT License
 
    Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2014-2015, 2018 Genome Research Ltd.
+   Copyright (C) 2014-2015, 2018, 2024 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -356,7 +356,39 @@ static const double __ac_HASH_UPPER = 0.77;
 			__ac_set_isdel_true(h->flags, x);							\
 			--h->size;													\
 		}																\
-	}
+	}                                                                   \
+    SCOPE int kh_stats_##name(kh_##name##_t *h, khint_t *empty,         \
+                              khint_t *deleted, khint_t *hist_size,     \
+                              khint_t **hist_out)                       \
+    {                                                                   \
+        khint_t i, *hist = NULL, dist_max = 0, k, dist, step;           \
+        khint_t mask = h->n_buckets - 1;                                \
+        *empty = *deleted = *hist_size = 0;                             \
+        hist = (khint_t *) calloc(1, sizeof(*hist));                    \
+        if (!hist) { return -1; }                                       \
+        for (i = kh_begin(h); i < kh_end(h); ++i) {                     \
+            if (__ac_isempty(h->flags, i)) { (*empty)++; continue; }      \
+            if (__ac_isdel(h->flags, i)) { (*deleted)++; continue; }      \
+            k = __hash_func(h->keys[i]) & (h->n_buckets - 1);           \
+            dist = 0;                                                   \
+            step = 0;                                                   \
+            while (k != i) {                                            \
+                dist++;                                                 \
+                k = (k + (++step)) & mask;                              \
+            }                                                           \
+            if (dist_max <= dist) {                                     \
+                khint_t *new_hist = (khint_t *) realloc(hist, sizeof(*new_hist) * (dist + 1)); \
+                if (!new_hist) { free(hist); return -1; }               \
+                for (k = dist_max + 1; k <= dist; k++) new_hist[k] = 0; \
+                hist = new_hist;                                        \
+                dist_max = dist;                                        \
+            }                                                           \
+            hist[dist]++;                                               \
+        }                                                               \
+        *hist_out = hist;                                               \
+        *hist_size = dist_max + 1;                                      \
+        return 0;                                                       \
+    }
 
 #define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
 	__KHASH_TYPE(name, khkey_t, khval_t) 								\
@@ -391,6 +423,7 @@ static const double __ac_HASH_UPPER = 0.77;
   @abstract     64-bit integer comparison function
  */
 #define kh_int64_hash_equal(a, b) ((a) == (b))
+
 /*! @function
   @abstract     const char* hash function
   @param  s     Pointer to a null terminated string
@@ -402,12 +435,28 @@ static kh_inline khint_t __ac_X31_hash_string(const char *s)
 	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
 	return h;
 }
+
+/*! @function
+  @abstract     const char* FNV1a hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_FNV1a_hash_string(const char *s)
+{
+	const khint_t offset_basis = 2166136261;
+	const khint_t FNV_prime = 16777619;
+	khint_t h = offset_basis;
+	for (; *s; ++s) h = (h ^ (uint8_t) *s) * FNV_prime;
+	return h;
+}
+
 /*! @function
   @abstract     Another interface to const char* hash function
   @param  key   Pointer to a nul terminated string [const char*]
   @return       The hash value [khint_t]
  */
-#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+#define kh_str_hash_func(key) __ac_FNV1a_hash_string(key)
+
 /*! @function
   @abstract     Const char* comparison function
  */
@@ -426,12 +475,29 @@ static kh_inline khint_t __ac_X31_hash_kstring(const kstring_t ks)
 		h = (h << 5) - h + (khint_t)ks.s[i];
 	return h;
 }
+
+/*! @function
+  @abstract     Kstring hash function
+  @param  ks    Kstring to hash (passed by value)
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_FNV1a_hash_kstring(const kstring_t ks)
+{
+	const khint_t offset_basis = 2166136261;
+	const khint_t FNV_prime = 16777619;
+	khint_t h = offset_basis;
+	size_t i;
+	for (i = 0; i < ks.l; i++)
+		h = (h ^ (uint8_t) ks.s[i]) * FNV_prime;
+	return h;
+}
+
 /*! @function
   @abstract     Interface to kstring hash function.
   @param  key   Pointer to a khash; permits hashing on non-nul terminated strings.
   @return       The hash value [khint_t]
  */
-#define kh_kstr_hash_func(key) __ac_X31_hash_kstring(key)
+#define kh_kstr_hash_func(key) __ac_FNV1a_hash_kstring(key)
 /*! @function
   @abstract     kstring comparison function
  */
@@ -604,6 +670,19 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key)
 		code;												\
 	} }
 
+/*! @function
+  @abstract  Gather hash table statistics
+  @param  name            Name of the hash table [symbol]
+  @param  h               Pointer to the hash table [khash_t(name)*]
+  @param  empty[out]      Number of empty hash bins [khint_t*]
+  @param  deleted[out]    Number of hash bins with the deleted flag [khint_t*]
+  @param  hist_size[out]  Number of entries in the @p hist array [khint_t*]
+  @param  hist[out]       Probe count histogram [khint_t**]; the caller releases it with free()
+  @return 0 on success; -1 on failure
+ */
+#define kh_stats(name, h, empty, deleted, hist_size, hist) \
+    kh_stats_##name(h, empty, deleted, hist_size, hist)
+
 /* More convenient interfaces */
 
 /*! @function
diff --git a/test/test_khash.c b/test/test_khash.c
new file mode 100644
index 000000000..a2e80b581
--- /dev/null
+++ b/test/test_khash.c
@@ -0,0 +1,502 @@
+/*  test_khash.c -- khash unit tests
+
+    Copyright (C) 2024 Genome Research Ltd.
+    Copyright (C) 2024 Centre for Population Genomics.
+
+    Author: Rob Davies <rmd@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.  */
+
+#include <config.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <getopt.h>
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+#include <time.h>
+#else
+#include <sys/time.h>
+#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <htslib/khash.h>
+#include <htslib/kroundup.h>
+
+#define MAX_ENTRIES 99999999
+
+KHASH_MAP_INIT_STR(str2int, int)
+// Print bucket counts and the probe-count histogram reported by kh_stats() to stdout.
+static void write_stats_str2int(khash_t(str2int) *h) {
+    khint_t empty = 0, deleted = 0, hist_size = 0, *hist = NULL;
+
+    if (kh_stats(str2int, h, &empty, &deleted, &hist_size, &hist) == 0) { // silent if it fails
+        khint_t i;
+        printf("n_buckets = %u\n",
+                kh_n_buckets(h));
+        printf("empty     = %u\n", empty);
+        printf("deleted   = %u\n", deleted);
+        for (i = 0; i < hist_size; i++) {
+            printf("dist[ %8u ] = %u\n", i, hist[i]);
+        }
+        free(hist); // the histogram array handed back by kh_stats() is ours to release
+    }
+}
+// Build num keys "test0", "test1", ... in one buffer, kl bytes per key; NULL on failure.
+static char *make_keys(size_t num, size_t kl) {
+    size_t i;
+    char *keys;
+
+    if (num > MAX_ENTRIES) return NULL;
+    keys = malloc(kl * num);
+    if (!keys) {
+        perror(NULL);
+        return NULL;
+    }
+    for (i = 0; i < num; i++) {
+        int len = snprintf(keys + kl * i, kl, "test%zu", i);
+        if (len < 0 || (size_t) len >= kl) { // output error, or key truncated to fit its slot
+            free(keys);
+            return NULL;
+        }
+    }
+    return keys;
+}
+// Insert key with value val; returns 0 on success, -1 (after a message on stderr) otherwise.
+static int add_str2int_entry(khash_t(str2int) *h, char *key, khint_t val) {
+    int ret = 0;
+    khint_t k = kh_put(str2int, h, key, &ret);
+
+    if (ret != 1 && ret != 2) { // only kh_put() codes 1 and 2 are accepted as a new insertion
+        fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n", key, ret);
+        return -1;
+    }
+    kh_val(h, k) = val;
+    return 0;
+}
+// Check that key maps to val, or is absent when is_deleted is set; 0 if so, else -1 + message.
+static int check_str2int_entry(khash_t(str2int) *h, char *key, khint_t val,
+                               uint8_t is_deleted) {
+    khint_t k = kh_get(str2int, h, key); // an iterator >= kh_end(h) is treated as "not found"
+    if (is_deleted) {
+        if (k < kh_end(h)) {
+            fprintf(stderr, "Found deleted entry %s in hash table\n", key);
+            return -1;
+        } else {
+            return 0;
+        }
+    }
+
+    if (k >= kh_end(h)) {
+        fprintf(stderr, "Couldn't find %s in hash table\n", key);
+        return -1;
+    }
+    if (strcmp(kh_key(h, k), key) != 0) { // the stored key text must match, not just the slot
+        fprintf(stderr, "Wrong key in hash table, expected %s got %s\n",
+                key, kh_key(h, k));
+        return -1;
+    }
+    if (kh_val(h, k) != val) {
+        fprintf(stderr, "Wrong value in hash table, expected %u got %u\n",
+                val, kh_val(h, k));
+        return -1;
+    }
+    return 0;
+}
+// Look key up and delete it; returns 0 on success, -1 (with a message) if it was not found.
+static int del_str2int_entry(khash_t(str2int) *h, char *key) {
+    khint_t k = kh_get(str2int, h, key);
+    if (k >= kh_end(h)) { // key is not in the table, so there is nothing to delete
+        fprintf(stderr, "Couldn't find %s to delete from hash table\n", key);
+        return -1;
+    }
+    kh_del(str2int, h, k);
+    return 0;
+}
+// Insert max keys, delete to_del pseudo-random ones, re-insert them; verify after each phase.
+static int test_str2int(size_t max, size_t to_del, int show_stats) {
+    const size_t kl = 16;
+    size_t mask = max + 1; // +1 so index max - 1 stays reachable after the "- 1" applied below
+    char *keys = make_keys(max, kl);
+    uint8_t *flags = NULL;
+    khash_t(str2int) *h;
+    khint_t i;
+    uint32_t r = 0x533d;
+
+    if (!keys) return -1;
+
+    h = kh_init(str2int);
+    if (!h) goto memfail;
+
+    // Add some entries
+    for (i = 0; i < max; i++) {
+        if (add_str2int_entry(h, keys + i * kl, i) != 0)
+            goto fail;
+    }
+
+    // Check they exist
+    for (i = 0; i < max; i++) {
+        if (check_str2int_entry(h, keys + i * kl, i, 0) != 0)
+            goto fail;
+    }
+
+    if (show_stats) {
+        printf("Initial fill:\n");
+        write_stats_str2int(h);
+    }
+
+    // Delete a random selection
+    flags = calloc(max, sizeof(*flags));
+    if (!flags) {
+        perror("");
+        goto fail;
+    }
+
+    kroundup_size_t(mask);
+    --mask;
+
+    // Note that this method may become slow for a high %age removed
+    // as it searches for the last available entries.  Despite this, it
+    // seems to be acceptable for the number of entries allowed.
+    for (i = 0; i < to_del; i++) {
+        khint_t victim;
+        // LFSR, see http://users.ece.cmu.edu/~koopman/lfsr/index.html
+        do {
+            r = (r >> 1) ^ ((r & 1) * 0x80000057U);
+            victim = (r & mask) - 1; // wraps to a huge value when (r & mask) == 0; rejected below
+        } while (victim >= max || flags[victim]);
+        if (del_str2int_entry(h, keys + victim * kl) != 0)
+            goto fail;
+        flags[victim] = 1;
+    }
+
+    // Check correct entries are present
+    for (i = 0; i < max; i++) {
+        if (check_str2int_entry(h, keys + i * kl, i, flags[i]) != 0)
+            goto fail;
+    }
+
+    if (show_stats) {
+        printf("\nAfter deletion:\n");
+        write_stats_str2int(h);
+    }
+
+    // Re-insert deleted entries
+    for (i = 0; i < max; i++) {
+        if (flags[i] && add_str2int_entry(h, keys + i * kl, i) != 0)
+            goto fail;
+    }
+
+    // Ensure they're all back
+    for (i = 0; i < max; i++) {
+        if (check_str2int_entry(h, keys + i * kl, i, 0) != 0)
+            goto fail;
+    }
+
+    if (show_stats) {
+        printf("\nAfter re-insert:\n");
+        write_stats_str2int(h);
+    }
+
+    kh_destroy(str2int, h);
+    free(keys);
+    free(flags);
+
+    return 0;
+
+ memfail:
+    perror(NULL);
+ fail:
+    kh_destroy(str2int, h);
+    free(keys);
+    free(flags);
+    return -1;
+}
+// Read keys_file into one buffer and index its non-empty lines; returns the key count, 0 on failure.
+static size_t read_keys(const char *keys_file, char **keys_out,
+                        char ***key_locations_out) {
+    FILE *in = fopen(keys_file, "r");
+    char *keys = NULL, *key, *end;
+    size_t keys_size = 1000000;
+    size_t keys_used = 0;
+    size_t avail, got, nkeys = 0;
+    char **key_locations = NULL;
+    struct stat fileinfo = { 0 };
+    int close_res;
+
+    if (!in)
+        goto fail; // also clears the output pointers
+
+    // Slurp entire file, pre-sizing the buffer when fstat() succeeds and reports a larger size
+    if (fstat(fileno(in), &fileinfo) == 0 && fileinfo.st_size > 0
+        && (size_t) fileinfo.st_size > keys_size)
+        keys_size = (size_t) fileinfo.st_size;
+
+    keys = malloc(keys_size + 1);
+    if (!keys)
+        goto fail;
+
+    do {
+        avail = keys_size - keys_used;
+        if (avail == 0) {
+            size_t new_size = keys_size + 1000000;
+            char *new_keys = realloc(keys, new_size + 1);
+            if (!new_keys)
+                goto fail;
+            keys = new_keys;
+            keys_size = new_size;
+            avail = keys_size - keys_used;
+        }
+        got = fread(keys + keys_used, 1, avail, in);
+        keys_used += got;
+    } while (got == avail); // a short read means end of file or an error (checked below)
+    keys[keys_used] = '\0'; // the extra byte allocated above always leaves room for this
+
+    if (ferror(in))
+        goto fail;
+    close_res = fclose(in);
+    in = NULL; // the stream is unusable even if fclose() failed, so never close it twice
+    if (close_res < 0) goto fail;
+
+    // Split by line
+    end = keys + keys_used;
+    for (key = keys; key != NULL; key = memchr(key, '\n', end - key)) {
+        while (*key == '\n') key++; // first pass only counts; runs of newlines are skipped
+        if (key < end) nkeys++;
+    }
+
+    key_locations = malloc(nkeys * sizeof(*key_locations));
+    if (!key_locations)
+        goto fail;
+
+    nkeys = 0;
+    for (key = keys; key != NULL; key = memchr(key, '\n', end - key)) {
+        while (*key == '\n') *key++ = '\0'; // terminate the previous key in place
+        if (key < end) {
+            key_locations[nkeys++] = key;
+        }
+    }
+    *keys_out = keys;
+    *key_locations_out = key_locations;
+    return nkeys;
+
+ fail:
+    if (in)
+        fclose(in);
+    free(keys);
+    *keys_out = NULL;
+    *key_locations_out = NULL;
+    return 0;
+}
+// Current time as one integer: CPU ns with HAVE_CLOCK_GETTIME_CPUTIME, else wall us; -1 on error.
+static long long get_time(void) {
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+    struct timespec ts;
+    if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts) < 0) {
+        perror("clock_gettime");
+        return -1;
+    }
+    return ts.tv_sec * 1000000000LL + ts.tv_nsec; // seconds + nanoseconds -> nanoseconds
+#else
+    struct timeval tv;
+    if (gettimeofday(&tv, NULL) < 0) {
+        perror("gettimeofday");
+        return -1;
+    }
+    return tv.tv_sec * 1000000LL + tv.tv_usec; // seconds + microseconds -> microseconds
+#endif
+}
+// Format a difference of two get_time() values as seconds; the result is overwritten per call.
+static char * fmt_time(long long elapsed) {
+    static char buf[64]; // returned to the caller, so it must outlive this call
+#ifdef HAVE_CLOCK_GETTIME_CPUTIME
+    long long sec = elapsed / 1000000000;
+    long long nsec = elapsed % 1000000000;
+    snprintf(buf, sizeof(buf), "%lld.%09lld processor seconds", sec, nsec);
+#else
+    long long sec = elapsed / 1000000;
+    long long usec = elapsed % 1000000;
+    snprintf(buf, sizeof(buf), "%lld.%06lld wall-time seconds", sec, usec);
+#endif
+    return buf;
+}
+// Time inserting then looking up the keys in keys_file, or 50000000 generated keys if it is NULL.
+static int benchmark(const char *keys_file) {
+    const size_t kl = 16;
+    size_t max = 50000000;
+    size_t i;
+    char *keys = NULL;
+    char **key_locations = NULL;
+    khash_t(str2int) *h;
+    long long start, end;
+
+    if (keys_file) {
+        max = read_keys(keys_file, &keys, &key_locations);
+    } else {
+        keys = make_keys(max, kl);
+    }
+
+    if (!keys) return -1;
+
+    h = kh_init(str2int);
+    if (!h) goto fail;
+
+    if ((start = get_time()) < 0)
+        goto fail;
+
+    if (keys_file) {
+        for (i = 0; i < max; i++) {
+            int ret;
+            khint_t k = kh_put(str2int, h, key_locations[i], &ret);
+            if (ret < 0) { // only negative codes are fatal for keys read from a file
+                fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n",
+                        key_locations[i], ret);
+                goto fail;
+            }
+            kh_val(h, k) = i;
+        }
+    } else {
+        for (i = 0; i < max; i++) {
+            int ret;
+            khint_t k = kh_put(str2int, h, keys + i * kl, &ret);
+            if (ret <= 0) { // for generated keys a return of 0 is treated as an error too
+                fprintf(stderr, "Unexpected return from kh_put(%s) : %d\n",
+                        keys + i * kl, ret);
+                goto fail;
+            }
+            kh_val(h, k) = i;
+        }
+    }
+
+    if ((end = get_time()) < 0)
+        goto fail;
+
+    printf("Insert %zu %s\n", max, fmt_time(end - start));
+
+    if ((start = get_time()) < 0)
+        goto fail;
+
+    if (keys_file) {
+        for (i = 0; i < max; i++) {
+            khint_t k = kh_get(str2int, h, key_locations[i]);
+            if (k >= kh_end(h)) {
+                fprintf(stderr, "Couldn't find %s in hash table\n",
+                        key_locations[i]);
+                goto fail;
+            }
+        }
+    } else {
+        for (i = 0; i < max; i++) {
+            khint_t k = kh_get(str2int, h, keys + i * kl);
+            if (k >= kh_end(h)) {
+                fprintf(stderr, "Couldn't find %s in hash table\n",
+                        keys + i * kl);
+                goto fail;
+            }
+        }
+    }
+
+    if ((end = get_time()) < 0)
+        goto fail;
+
+    printf("Lookup %zu %s\n", max, fmt_time(end - start));
+
+    write_stats_str2int(h);
+
+    kh_destroy(str2int, h);
+    free(keys);
+    free(key_locations);
+    return 0;
+ fail:
+    kh_destroy(str2int, h);
+    free(keys);
+    free(key_locations); // NULL unless keys came from a file; free(NULL) is a no-op
+    return -1;
+}
+// Write the command-line help for program name prog to the stream out.
+static void show_usage(FILE *out, char *prog) {
+    fprintf(out, "Usage : %s [-d] [-f <frac>] [-i <file>] [-n <int>] [-t <test>]\n", prog);
+    fprintf(out, " Options:\n");
+    fprintf(out, "  -t <TEST>   Test to run (str2int, benchmark)\n");
+    fprintf(out, "  -i <FILE>   Optional input file for benchmark\n");
+    fprintf(out, "  -n <INT>    Number of items to add\n");
+    fprintf(out, "  -f <FRAC>   Fraction to delete and re-insert\n");
+    fprintf(out, "  -d          Dump hash table stats\n");
+    fprintf(out, "  -h          Show this help\n");
+}
+// Parse the command line and run the selected test (str2int when -t is not given).
+int main(int argc, char **argv) {
+    int opt, res = EXIT_SUCCESS;
+    char *test = NULL;
+    char *input_file = NULL;
+    size_t max = 1000;
+    double del_frac = 0.25;
+    int show_stats = 0;
+
+    while ((opt = getopt(argc, argv, "df:hi:n:t:")) != -1) {
+        switch (opt) {
+        case 'd':
+            show_stats = 1;
+            break;
+        case 'f':
+            del_frac = strtod(optarg, NULL);
+            if (del_frac < 0 || del_frac > 1.0) {
+                fprintf(stderr, "Error: -f must be between 0.0 and 1.0\n");
+                return EXIT_FAILURE;
+            }
+            break;
+        case 'h':
+            show_usage(stdout, argv[0]);
+            return EXIT_SUCCESS;
+        case 'i':
+            input_file = optarg;
+            break;
+        case 'n':
+            max = strtoul(optarg, NULL, 0);
+            if (max == 0 || max > MAX_ENTRIES) {
+                fprintf(stderr, "Error: -n must be between 1 and %d\n",
+                        MAX_ENTRIES);
+                return EXIT_FAILURE;
+            }
+            break;
+        case 't':
+            test = optarg;
+            break;
+        default:
+            show_usage(stderr, argv[0]);
+            return EXIT_FAILURE;
+        }
+    }
+
+    if (!test || strcmp(test, "str2int") == 0) {
+        if (test_str2int(max, (size_t) (max * del_frac), show_stats) != 0)
+            res = EXIT_FAILURE;
+    } else if (strcmp(test, "benchmark") == 0) {
+        if (benchmark(input_file) != 0)
+            res = EXIT_FAILURE;
+    } else { // a mistyped test name must not be reported as a pass
+        fprintf(stderr, "Error: unknown test '%s'\n", test);
+        res = EXIT_FAILURE;
+    }
+    return res;
+}

From f8016c0198bc7033cb6769e0394e18fbdfe353f1 Mon Sep 17 00:00:00 2001
From: vasudeva8 <vasudeva.sarma@sanger.ac.uk>
Date: Mon, 21 Aug 2023 17:42:26 +0100
Subject: [PATCH 49/76] Added qtask_ordered, qtask_unordered and index_fasta
 examples.

The qtask programs demonstrate the use of ordered and unordered result
processing from a thread pool.

The index_fasta and read_fast_index tools demonstrate the fasta
indexing capabilities.

Improved the Makefile and tidied up the indentation and code example
layout / structure in DEMO.md

Also several minor wording and spelling fixes.
---
 samples/DEMO.md               | 717 ++++++++++++++++++++++++----------
 samples/Makefile              |  71 ++--
 samples/README.md             |  41 +-
 samples/add_header.c          |   8 +-
 samples/cram.c                |   6 +-
 samples/dump_aux.c            |   8 +-
 samples/flags_demo.c          |   6 +-
 samples/flags_htsopt_field.c  |   6 +-
 samples/index_fasta.c         |  72 ++++
 samples/index_multireg_read.c |   6 +-
 samples/index_reg_read.c      |   8 +-
 samples/index_write.c         |   6 +-
 samples/mod_aux.c             |   6 +-
 samples/mod_aux_ba.c          |   6 +-
 samples/mod_bam.c             |   6 +-
 samples/modstate.c            |   6 +-
 samples/mpileup.c             |   6 +-
 samples/pileup.c              |   6 +-
 samples/pileup_mod.c          |   6 +-
 samples/qtask_ordered.c       | 425 ++++++++++++++++++++
 samples/qtask_unordered.c     | 320 +++++++++++++++
 samples/read_aux.c            |  10 +-
 samples/read_bam.c            |   6 +-
 samples/read_fast.c           |  11 +-
 samples/read_fast_index.c     | 163 ++++++++
 samples/read_header.c         |   6 +-
 samples/read_refname.c        |   6 +-
 samples/rem_header.c          |   8 +-
 samples/sample.bed            |   4 +
 samples/sample.ref.fq         |  16 +
 samples/sample.sam            |   2 +-
 samples/split.c               |   6 +-
 samples/split2.c              |   8 +-
 samples/split_thread1.c       |  10 +-
 samples/split_thread2.c       |   6 +-
 samples/update_header.c       |   6 +-
 samples/write_fast.c          |  36 +-
 37 files changed, 1704 insertions(+), 342 deletions(-)
 create mode 100644 samples/index_fasta.c
 create mode 100644 samples/qtask_ordered.c
 create mode 100644 samples/qtask_unordered.c
 create mode 100644 samples/read_fast_index.c
 create mode 100644 samples/sample.bed
 create mode 100644 samples/sample.ref.fq

diff --git a/samples/DEMO.md b/samples/DEMO.md
index 911792899..98c9981b8 100644
--- a/samples/DEMO.md
+++ b/samples/DEMO.md
@@ -88,18 +88,24 @@ alignment. It adds count of ATCGN base as an array in auxiliary data, BA:I.
 Modified data is written on standard output.
 
 Write_fast - This application showcases the fasta/fastq data write. It appends
-a dummy data to given file.
+data to given file.
 
 Index_write - This application showcases the creation of index along with
 output creation. Based on file type and shift, it creates bai, csi or crai
 files.
 
+Index_fasta - This application showcases the index creation on fasta/fastq
+reference files.
+
 Read_reg - This application showcases the usage of region specification in
 alignment read.
 
-Read_multireg - This application showcases the usage of mulitple regionn
+Read_multireg - This application showcases the usage of multiple region
 specification in alignment read.
 
+Read_fast_index - This application showcases the fasta/fastq data read using
+index.
+
 Pileup - This application showcases the pileup api, where all alignments
 covering a reference position are accessed together. It displays the bases
 covering each position on standard output.
@@ -131,6 +137,15 @@ handling. It saves the read1 and read2 as separate files in given directory,
 one as sam and other as bam. A pool of 4 threads is created and shared for both
 read and write.
 
+Qtask_ordered - This application showcases the use of queues and threads for
+custom processing. Alignments in input file are updated with their GC ratio
+on a custom aux tag. The processing may occur in any order but the result is
+retrieved in same order as it was queued and saved to disk.
+
+Qtask_unordered - This application showcases the use of queues and threads
+for custom processing. The count of bases and GC ratio are calculated and
+displayed.  The order of counting is irrelevant and hence ordered retrieval is
+not used.
 
 ## Building the sample apps
 
@@ -173,7 +188,7 @@ sam_read1 api. samFile pointer, header and bam storage are to be passed as
 argument and it returns 0 on success, -1 on end of file and < -1 in case of
 errors.
 
-The bam storage has to be initialised using bam_init1 api before the call and
+The bam storage has to be initialized using bam_init1 api before the call and
 can be reused for successive reads. Once done, it needs to be destroyed using
 bam_destroy1.  The member field named core - bam1_core_t - in bam storage,
 bam1_t, has the sequence data in an easily accessible way. Using the fields
@@ -185,30 +200,31 @@ and macros, data can easily be read from it.
     {
         ...
         //initialize
-        if (!(bamdata = bam_init1())) {
-        ...
+        if (!(bamdata = bam_init1()))
+           ... // error
         //open input files - r reading
-        if (!(infile = sam_open(inname, "r"))) {
-        ...
+        if (!(infile = sam_open(inname, "r")))
+           ... // error
         //read header
-        if (!(in_samhdr = sam_hdr_read(infile))) {
-        ...
+        if (!(in_samhdr = sam_hdr_read(infile)))
+           ... // error
+
         //read data, check flags and update count
         while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
-            if (bamdata->core.flag & BAM_FREAD1) {
+            if (bamdata->core.flag & BAM_FREAD1)
                 cntread1++;
-            }
-        ...
+            ...
+
         //clean up
-        if (in_samhdr) {
+        if (in_samhdr)
             sam_hdr_destroy(in_samhdr);
-        }
-        if (infile) {
+
+        if (infile)
             sam_close(infile);
-        }
-        if (bamdata) {
+
+        if (bamdata)
             bam_destroy1(bamdata);
-        }
+
         return ret;
     }
 Refer: flags_demo.c
@@ -255,21 +271,23 @@ set the reference name in the alignment. It returns -ve value on error.
     int main(int argc, char *argv[])
     {
         ...
-        if (!(infile = sam_open(inname, "r"))) {
-        ...
+        if (!(infile = sam_open(inname, "r")))
+           ... // error
         outfile1 = sam_open(file1, "w");            //as SAM
         outfile2 = sam_open(file2, "wb");           //as BAM
         ...
-        if (!(in_samhdr = sam_hdr_read(infile))) {
-        ...
+        if (!(in_samhdr = sam_hdr_read(infile)))
+           ... // error
+
         //write header
         if ((sam_hdr_write(outfile1, in_samhdr) == -1) ||
-         (sam_hdr_write(outfile2, in_samhdr) == -1)) {
-        ...
+         (sam_hdr_write(outfile2, in_samhdr) == -1))
+           ... // error
+
         while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
             if (bamdata->core.flag & BAM_FREAD1) {
                 if (sam_write1(outfile1, in_samhdr, bamdata) < 0) {
-        ...
+                    ... // error
     }
 Refer: split.c
 
@@ -284,10 +302,11 @@ Below code excerpt shows sam_open_mode api usage.
         ...
         //set file open mode based on file name for 1st and as explicit for 2nd
         if ((sam_open_mode(mode1+1, file1, NULL) == -1) ||
-         (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) {
-        ...
-        if (!(infile = sam_open(inname, "r"))) {
-        ...
+         (sam_open_mode(mode2+1, file2, "sam.gz") == -1))
+           ... // error
+        if (!(infile = sam_open(inname, "r")))
+           ... // error
+
         //open output files
         outfile1 = sam_open(file1, mode1);                          //as compressed SAM through sam_open
         outfile2 = sam_open_format(file2, mode2, NULL);             //as compressed SAM through sam_open_format
@@ -321,7 +340,7 @@ api and used with sam_open_format api to create appropriate CRAM file.
             hts_parse_format(&fmt2, reffmt2) == -1 ||               //embed the reference internally
             hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 ||    //embed autogenerated reference
             hts_parse_format(&fmt4, "cram,no_ref=1") == -1) {       //no reference data encoding at all
-    ...
+       ... // error
     outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2);
     ...
 Refer: cram.c
@@ -337,16 +356,20 @@ or explicit format text. This mode buffer can be used with sam_open or can be
 used with sam_open_format with explicit format information in htsFormat
 structure.
 
+The FASTA format is mainly used to store reference data.
+
     ...
-    if (!(bamdata = bam_init1())) {
-    ...
-    if (!(infile = sam_open(inname, "r"))) {
-    ...
-    if (infile->format.format != fasta_format && infile->format.format != fastq_format) {
-    ...
-    if (!(in_samhdr = sam_hdr_read(infile))) {
-    ...
-    while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+    if (!(bamdata = bam_init1()))
+      ... // error
+    if (!(infile = sam_open(inname, "r")))
+       ... // error
+    if (infile->format.format != fasta_format && infile->format.format != fastq_format)
+       ... // error
+    if (!(in_samhdr = sam_hdr_read(infile)))
+       ... // error
+
+    while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        ...
         printf("\nsequence: ");
         for (c = 0; c < bamdata->core.l_qseq; ++c) {
             printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]);
@@ -354,23 +377,22 @@ structure.
         if (infile->format.format == fastq_format) {
             printf("\nquality: ");
             for (c = 0; c < bamdata->core.l_qseq; ++c) {
-                printf("%c", bam_get_qual(bamdata)[c]);
+                printf("%c", bam_get_qual(bamdata)[c] + 33);
     ...
 Refer: read_fast.c
 
     ...
     char mode[4] = "a";
     ...
-    if (sam_open_mode(mode + 1, outname, NULL) < 0) {
-    ...
-    if (!(outfile = sam_open(outname, mode))) {
-    ...
-    if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0)
-     < 0) {
-    ...
+    if (sam_open_mode(mode + 1, outname, NULL) < 0)
+       ... // error
+    if (!(outfile = sam_open(outname, mode)))
+       ... // error
+    if (bam_set1(bamdata, strlen(name), name, BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, strlen(data), data, qual, 0) < 0)
+       ... // error
     if (sam_write1(outfile, out_samhdr, bamdata) < 0) {
         printf("Failed to write data\n");
-    ...
+        ...
 Refer: write_fast.c
 
 
@@ -388,18 +410,21 @@ line can be retrieved using sam_hdr_find_line_pos or sam_hdr_line_id with
 position and unique identifier values respectively.
 
     ...
-    if (!(in_samhdr = sam_hdr_read(infile))) {
-    ...
-            ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data);
+    if (!(in_samhdr = sam_hdr_read(infile)))
+        ... // error
     ...
-            ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data);
+      if (tag)
+          ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data);
+      else
+          ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data);
     ...
         linecnt = sam_hdr_count_lines(in_samhdr, header);
-    ...
-            ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data);
-    ...
-            ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data);
-    ...
+        ...
+            if (tag)
+                ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data);
+            else
+                ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data);
+        ...
 Refer: read_header.c
 
 This will show the VN tag's value from HD header.
@@ -417,16 +442,19 @@ Below code excerpt shows the reference names which has length above given value.
     ...
     //iterate and check each reference's length
     for (pos = 1, c = 0; c < linecnt; ++c) {
-        if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data) == -2)) {
-    ...
+        if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data) == -2))
+            ... // error
+
         size = atoll(data.s);
         if (size < minsize) {
             //not required
             continue;
         }
-        if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) {
-            //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same!
-    ...
+
+        //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same!
+        if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c)))
+            ... // error
+
         printf("%d,%s,%s\n", pos, id, data.s);
     ...
 Refer: read_refname.c
@@ -465,8 +493,8 @@ indexing the seq_nt16_str array.
         printf("MQUAL: %d\n", bamdata->core.qual);                              //map quality value
         cigar = bam_get_cigar(bamdata);                                         //retrieves the cigar data
         for (i = 0; i < bamdata->core.n_cigar; ++i) {                           //no. of cigar data entries
-            printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));   //the macros gives the count of operation
-             and the symbol of operation for given cigar entry
+            printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));
+            //the macros give the count and the symbol of the operation for the given cigar entry
         }
         printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize);
         data = bam_get_seq(bamdata);
@@ -475,8 +503,8 @@ indexing the seq_nt16_str array.
         ...
         for (i = 0; i < bamdata->core.l_qseq ; ++i) {       //sequence length
             printf("%c", seq_nt16_str[bam_seqi(data, i)]);  //retrieves the base from (internal compressed) sequence data
-        ...
-            printf("%c", bam_get_qual(bamdata)[i]+33);      //retrives the quality value
+            ...
+            printf("%c", bam_get_qual(bamdata)[i]+33);      //retrieves the quality value
         ...
 Refer: read_bam.c
 
@@ -516,15 +544,13 @@ given position of the array.
 
     ...
     while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
-        if (i % 2) {    //use options alternatively to demonstrate both
-            //option 1 - get data as string with tag and type
-            if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) {
-                printf("%s\n",sdata.s);
-    ...
-            //option 2 - get raw data
-            if (!(data = bam_aux_get(bamdata, tag))) {
-    ...
-                if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) {
+        //option 1 - get data as string with tag and type
+        if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) {
+            printf("%s\n",sdata.s);
+        ...
+        //option 2 - get raw data
+        if ((data = bam_aux_get(bamdata, tag)) != NULL) {
+            printauxdata(stdout, bam_aux_type(data), -1, data);
     ...
 Refer: read_aux.c
 
@@ -539,8 +565,8 @@ Shows the MD aux tag from alignments.
             printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 'i' : bam_aux_type(data));
               //macros gets the tag and type of aux data
             //dump the data
-            if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) {
-    ...
+            printauxdata(stdout, bam_aux_type(data), -1, data);
+            ...
             data = bam_aux_next(bamdata, data);                                     //get the next aux data
     ...
 Refer: dump_aux.c
@@ -563,19 +589,22 @@ sam_hdr_write api does the write of the header data to file.
 
     ...
     //add SQ line with SN as TR1 and TR2
-    if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) {                                      //length as 0 for NULL terminated data
-    ...
+    if (sam_hdr_add_lines(in_samhdr, &sq[0], 0))                                        //length as 0 for NULL terminated data
+        ... // error
+
     //add RG line with ID as RG1
-    if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) {
-    ...
-    //add pg line
-    if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) {    //NULL is to indicate end of args
-    ...
-    if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) {                         //NULL is to indicate end of args
-    ...
+    if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL))
+        ... // error
+
+    //add PG/CO lines
+    if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL))      //NULL is to indicate end of args
+        ... // error
+    if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL))                           //NULL is to indicate end of args
+        ... // error
+
     //write output
-    if (sam_hdr_write(outfile, in_samhdr) < 0) {
-    ...
+    if (sam_hdr_write(outfile, in_samhdr) < 0)
+        ... // error
 Refer: add_header.c
 
 Not all type of header data can be removed but where it is possible, either a
@@ -585,14 +614,14 @@ to be used. To remove all lines of a type, header type and unique identifier
 field tag are to be used.
 
     ...
-        //remove specific line
-        if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) {
-    ...
-        //remove multiple lines of a header type
-        if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) {
-    ...
-    if (sam_hdr_write(outfile, in_samhdr) < 0) {
-    ...
+
+    //remove specific line
+    if (sam_hdr_remove_line_id(in_samhdr, header, id, idval) < 0)
+        ... // error
+
+    //remove multiple lines of a header type
+    if (sam_hdr_remove_lines(in_samhdr, header, id, NULL) < 0)
+        ... // error
 Refer: rem_header.c
 
 Shows the file content after removing SQ line with SN 2.
@@ -640,13 +669,12 @@ be easier than update of existing record.
             break;
             case 3:// RNAME
             case 7:// RNEXT
-                if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0) {
-    ...
+                if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0)
+                    ... // error
                 if (field == 3) {
                     //reference
                     bamdata->core.tid = ret;
-                }
-                else {
+                } else {
                     //mate reference
                     bamdata->core.mtid = ret;
                 }
@@ -659,20 +687,21 @@ be easier than update of existing record.
             break;
             case 6:// CIGAR
             {
-    ...
+                ...
                 //get cigar array and set all data in new bam record
-                if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) {
-    ...
+                if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0)
+                    ... // error
                 if (bam_set1(newbam, bamdata->core.l_qname, bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid,
                  bamdata->core.pos, bamdata->core.qual, ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos,
                   bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata),
-                   (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) {
-    ...
+                   (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0)
+                    ... // error
+
                 //correct sequence data as input is expected in ascii format and not as compressed inside bam!
                 memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2);
                 //copy the aux data
                 memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), bam_get_l_aux(bamdata));
-    ...
+            ...
             break;
             case 8:// PNEXT
                 bamdata->core.mpos = atoll(val);
@@ -681,18 +710,16 @@ be easier than update of existing record.
                 bamdata->core.isize = atoll(val);
             break;
             case 10:// SEQ
-    ...
+                ...
                 for( c = 0; c < i; ++c) {
                     bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]);
                 }
             break;
             case 11:// QUAL
-    ...
-                for (c = 0; c < i; ++c) {
+                ...
+                for (c = 0; c < i; ++c)
                     val[c] -= 33;               //phred score from ascii value
-                }
                 memcpy(bam_get_qual(bamdata), val, i);
-    ...
 Refer: mod_bam.c
 
 Shows data with RNAME modified to T2.
@@ -707,33 +734,32 @@ present at all, it can be appended using bam_aux_append.
     //matched to qname, update aux
     if (!(data = bam_aux_get(bamdata, tag))) {
         //tag not present append
-    ...
-        if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val)) {
-    ...
-    else {
-        char auxtype = bam_aux_type(data);
+        ... // cut: computed length and val based on tag type
+        if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val))
+            ... // error
+    } else {
         //update the tag with newer value
+        char auxtype = bam_aux_type(data);
         switch (type) {
             case 'f':
             case 'd':
-    ...
-                if (bam_aux_update_float(bamdata, tag, atof(val))) {
-    ...
+                ...
+                if (bam_aux_update_float(bamdata, tag, atof(val)))
+                    ... // error
             case 'C':
             case 'S':
             case 'I':
-    ...
-                if (bam_aux_update_int(bamdata, tag, atoll(val))) {
-    ...
+                ...
+                if (bam_aux_update_int(bamdata, tag, atoll(val)))
+                    ... // error
             case 'Z':
-    ...
-                if (bam_aux_update_str(bamdata, tag, length, val)) {
-    ...
+                ...
+                if (bam_aux_update_str(bamdata, tag, length, val))
+                    ... // error
             case 'A':
-    ...
+                ...
                 //update the char data directly on buffer
                 *(data+1) = val[0];
-    ...
 Refer: mod_aux.c
 
 Shows the given record's MD tag set to Test.
@@ -743,12 +769,14 @@ Shows the given record's MD tag set to Test.
 The array aux fields can be updated using bam_aux_update_array api.
 
     ...
-    if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) {
-    ...
+    if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt))
+        ... // error
 Refer: mod_aux_ba.c
 
 Shows the records updated with an array of integers, containing count of ACGT
-and N in that order.
+and N in that order. The bases are decoded before count for the sake of
+simplicity. Refer qtask_ordered.c for a better counting where decoding is made
+outside the loop.
 
     ./mod_aux_ba samtools/test/mpileup/mpileup.1.bam
 
@@ -761,14 +789,14 @@ can be read easily. There are different type of indices, BAI, CSI, CRAI, TBI,
 FAI etc. and are usually used with iterators.
 
 Indexing of plain/textual files are not supported, compressed SAM&FASTA/Q, BAM,
-and CRAM files can be indexed. CRAM files are indexed as .crai and the other two
-can be indexed as .bai or .csi files. Each of these types have different
-internal representations of the index information. Bai uses a fixed
-configuration values where as csi has them dynamically updated based on the
-alignment data.
+and CRAM files can be indexed. CRAM files are indexed as .crai and the others
+as .bai, .csi, .fai etc. Each of these types have different internal
+representations of the index information. Bai uses fixed configuration values
+whereas csi has them dynamically updated based on the alignment data.
 
 Indexes can be created either with save of alignment data or explicitly by
-read of existing alignment file.
+read of existing alignment file for alignment data (SAM/BAM/CRAM). For reference
+data it has to be explicitly created (FASTA).
 
 To create index along with alignment write, the sam_idx_init api need to be
 invoked before the start of alignment data write. This api takes the output
@@ -777,16 +805,17 @@ index, the min shift has to be 0.
 
 At the end of write, sam_idx_save api need to be invoked to save the index.
 
-    //write header
-    if (sam_hdr_write(outfile, in_samhdr)) {
     ...
+    //write header
+    if (sam_hdr_write(outfile, in_samhdr))
+        ... // error
     // initialize indexing, before start of write
-    if (sam_idx_init(outfile, in_samhdr, size, fileidx)) {
-    ...
-        if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
-    ...
-    if (sam_idx_save(outfile)) {
-    ...
+    if (sam_idx_init(outfile, in_samhdr, size, fileidx))
+        ... // error
+        if (sam_write1(outfile, in_samhdr, bamdata) < 0)
+            ... // error
+    if (sam_idx_save(outfile))
+        ... // error
 Refer:index_write.c
 
 Creates mpileup.1.bam and mpileup.1.bam.bai in /tmp/.
@@ -803,6 +832,20 @@ The sam_index_build2 api takes the index file path as well and gives more
 control than the previous one.  The sam_index_build3 api provides an option to
 configure the number of threads in index creation.
 
+Index for reference data can be created using fai_build3 api. This creates
+index file with .fai extension. If the file is bgzip-ped, a .gzi file is
+created as well. It takes the path to input file and that of fai and gzi files.
+When the fai/gzi paths are NULL, they are created alongside the input file.
+These index files will be useful for reference data access.
+
+    ...
+    if (fai_build3(filename, NULL, NULL) == -1)
+        ... // error
+Refer: index_fast.c
+
+A tabix index can be created for compressed vcf/sam/bed and other data using
+tbx_index_build. It is mainly used with vcf and non-sam type files.
+
 
 ### Read with iterators
 
@@ -849,18 +892,19 @@ sam_itr_destroy and hts_idx_destroy apis does this.
 
     ...
     //load index file
-    if (!(idx = sam_index_load2(infile, inname, idxfile))) {
-    ...
+    if (!(idx = sam_index_load2(infile, inname, idxfile)))
+        ... // error
     //create iterator
-    if (!(iter = sam_itr_querys(idx, in_samhdr, region))) {
-    ...
+    if (!(iter = sam_itr_querys(idx, in_samhdr, region)))
+        ... // error
+
     //read using iterator
-    while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) {
-    ...
-    if (iter) {
+    while ((c = sam_itr_next(infile, iter, bamdata)) >= 0)
+        ... // process bamdata
+
+    if (iter)
         sam_itr_destroy(iter);
-    }
-    if (idx) {
+    if (idx)
         hts_idx_destroy(idx);
     ...
 Refer:index_reg_read.c
@@ -891,19 +935,20 @@ itself.
 
     ...
     //load index file, assume it to be present in same location
-    if (!(idx = sam_index_load(infile, inname))) {
-    ...
+    if (!(idx = sam_index_load(infile, inname)))
+        ... // error
     //create iterator
-    if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) {
-    ...
+    if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt)))
+        ... // error
     if (regions) {
         //can be freed as it is no longer required
         free(regions);
         regions = NULL;
     }
+
     //get required area
-    while ((c = sam_itr_multi_next(infile, iter, bamdata) >= 0)) {
-    ...
+    while ((c = sam_itr_multi_next(infile, iter, bamdata)) >= 0)
+        ... // process bamdata
 Refer:index_multireg_read.c
 
 With compressed sample.sam and 2 regions from reference T1 (30 to 32) and 1
@@ -921,13 +966,70 @@ hts_idx_destroy. The hts_reglist_t* array passed is destroyed by the library
 on iterator destroy. The regions array (array of char array/string) needs to be
 destroyed by the user itself.
 
+For fasta/fastq files, the index has to be loaded using fai_load3_format which
+takes the file, index file names and format. With single region specification
+fai_fetch64 can be used to get bases, and fai_fetchqual64 for quality in case
+of fastq data. With multiple region specification, with comma separation,
+faidx_fetch_seq64 and faidx_fetch_qual64 do the job. Regions have to be parsed
+using fai_parse_region in case of multiregion specifications. fai_adjust_region
+is used to adjust the start-end points based on available data.
+
+Below excerpt shows fasta/q access with single and multiregions,
+
+    ...
+    //load index
+    if (!(idx = fai_load3_format(inname, NULL, NULL, FAI_CREATE, fmt)))
+        ... // error
+
+    ...
+    if (!usemulti) {
+        //get data from single given region
+        if (!(data = fai_fetch64(idx, region, &len)))
+            ... // region not found
+
+        printf("Data: %"PRId64" %s\n", len, data);
+        free((void*)data);
+        //get quality for fastq type
+        if (fmt == FAI_FASTQ) {
+            if (!(data = fai_fetchqual64(idx, region, &len)))
+                ... // region not found
+        ...
+
+    } else { // usemulti
+        //parse, get each region and get data for each
+        while ((remaining = fai_parse_region(idx, region, &tid, &beg, &end, HTS_PARSE_LIST))) {     //here expects regions as csv
+            //parsed the region, correct end points based on actual data
+            if (fai_adjust_region(idx, tid, &beg, &end) == -1)
+                ... // error
+            //get data for given region
+            if (!(data = faidx_fetch_seq64(idx, faidx_iseq(idx, tid), beg, end, &len)))
+                ... // region not found
+
+            printf("Data: %"PRIhts_pos" %s\n", len, data);
+            free((void*)data);
+            data = NULL;
+            //get quality data for fastq
+            if (fmt == FAI_FASTQ) {
+                if (!(data = faidx_fetch_qual64(idx, faidx_iseq(idx, tid), beg, end, &len)))
+                    ... // error
+                printf("Qual: %"PRIhts_pos" %s\n", len, data);
+                free((void*)data);
+            ...
+            region = remaining;                                     //parse remaining region defs
+
+    ...
+    if (idx) {
+        fai_destroy(idx);
+    ...
+Refer: read_fast_index.c
+
 
 ### Pileup and MPileup
 
 Pileup shows the transposed view of the SAM alignment data, i.e. it shows the
-the reference positions and bases which cover that position through different
-reads side by side. MPileup facilitates the piling up of multiple sam files
-against each other and same reference at the same time.
+reference positions and bases which cover that position through different reads
+side by side. MPileup facilitates the piling up of multiple sam files against
+each other and same reference at the same time.
 
 Mpileup has replaced the pileup. The input expects the data to be sorted by
 position.
@@ -978,8 +1080,8 @@ above the cache limit are discarded.
 Once done, the pileup iterator to be discarded by sam_plp_destroy api.
 
     ...
-    if (!(plpiter = bam_plp_init(readdata, &conf))) {
-    ...
+    if (!(plpiter = bam_plp_init(readdata, &conf)))
+        ... // error
     //set constructor destructor callbacks
     bam_plp_constructor(plpiter, plpconstructor);
     bam_plp_destructor(plpiter, plpdestructor);
@@ -1011,7 +1113,7 @@ Once done, the pileup iterator to be discarded by sam_plp_destroy api.
                     printf("?");
                 }
     ...
-    if (plpiter) {
+    if (plpiter)
         bam_plp_destroy(plpiter);
     ...
 Refer:pileup.c
@@ -1067,8 +1169,8 @@ above the cache limit are discarded.
 Once done, the pileup iterator to be discarded by sam_mplp_destroy api.
 
     ...
-    if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) {
-    ...
+    if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf)))
+        ... // error
     //set constructor destructor callbacks
     bam_mplp_constructor(mplpiter, plpconstructor);
     bam_mplp_destructor(mplpiter, plpdestructor);
@@ -1134,13 +1236,13 @@ end of processing, the state need to be released using hts_base_mod_state_free
 api.
 
     ...
-    if (!(ms = hts_base_mod_state_alloc())) {
-    ...
+    if (!(ms = hts_base_mod_state_alloc()))
+        ... // error
     while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0)
     {
-    ...
-        if (bam_parse_basemod(bamdata, ms)) {
-    ...
+        ...
+        if (bam_parse_basemod(bamdata, ms))
+            ... // error
         bm = bam_mods_recorded(ms, &cnt);
         for (k = 0; k < cnt; ++k) {
             printf("%c", bm[k]);
@@ -1191,7 +1293,7 @@ api.
             }
         }
     ...
-    if (ms) {
+    if (ms)
         hts_base_mod_state_free(ms);
     ...
 Refer:modstate.c
@@ -1221,7 +1323,7 @@ api.
     {
     ...
     if (!(plpiter = bam_plp_init(readdata, &conf))) {
-    ...
+        ... // error
     //set constructor destructor callbacks
     bam_plp_constructor(plpiter, plpconstructor);
     bam_plp_destructor(plpiter, plpdestructor);
@@ -1238,11 +1340,11 @@ api.
             }
             /*invoke bam mods_mods_at_qpos before bam_plp_insertion_mod that the base modification
             is retrieved before change in pileup pos thr' plp_insertion_mod call*/
-            if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) {
-    ...
+            if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1)
+                ... // error
             //use plp_insertion/_mod to get insertion and del at the same position
-            if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) {
-    ...
+            if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1)
+                ... // error
             //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered
             //base and modification
             printf("%c%c%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) :
@@ -1260,7 +1362,7 @@ api.
                     printf("-%d", dellen);
                     for (k = 0; k < dellen; ++k) {
                         printf("?");
-    ...
+                ...
             else if (plp[j].indel < 0) {
                 //deletion
                 printf("%d", plp[j].indel);
@@ -1285,17 +1387,18 @@ data and a combination of flags for the required fields can be passed with
 CRAM_OPT_REQUIRED_FIELDS to this api.
 
     ...
-       //select required field alone, this is useful for CRAM alone
-       if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) {
-    ...
-       //read header
-       in_samhdr = sam_hdr_read(infile);
+    //select required field alone, this is useful for CRAM alone
+    if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0)
+        ... // error
+
+    //read header
+    in_samhdr = sam_hdr_read(infile);
     ...
     //read data, check flags and update count
     while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
-        if (bamdata->core.flag & BAM_FREAD1) {
+        if (bamdata->core.flag & BAM_FREAD1)
             cntread1++;
-    ...
+        ...
 Refer: flags_htsopt_field.c
 
 
@@ -1303,48 +1406,248 @@ Refer: flags_htsopt_field.c
 
 The HTSLib api supports thread pooling for better performance. There are a few
 ways in which this can be used. The pool can be made specific for a file or a
-generic pool can be created and shared across multiple files. Another way to
-use thread pool is to schedule tasks explicitly to queues which gets executed
-using threads in pool.
+generic pool can be created and shared across multiple files. Thread pool can
+also be used to execute user defined tasks. The tasks are to be added to queue,
+threads in pool execute them and results can be queued back if required.
 
 To have a thread pool specific for a file, hts_set_opt api can be used with the
-file pointer, HTS_OPT_NTHREADS and the number of threads to use in the pool.
-Closure of file releases the thread pool as well. To have a thread pool which
-can be shared across different files, it needs to be initialized using
-hts_tpool_init api, passing number of threads as argument. This thread pool can
-be associated with a file using hts_set_opt api. The file pointer,
-HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments
-to api. The thread pool has to be released with hts_tpool_destroy.
+file pointer, HTS_OPT_NTHREADS and the number of threads to be in the pool.
+Thread pool is released on closure of file. To have a thread pool which can be
+shared across different files, it needs to be initialized using hts_tpool_init
+api, passing number of threads as an argument. This thread pool can be
+associated with a file using hts_set_opt api. The file pointer,
+HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments to
+the api. The thread pool has to be released with hts_tpool_destroy.
+
+The samples are trivial ones to showcase the usage of api. The number of threads
+to use for different tasks has to be identified based on complexity and
+parallelism of the task.
 
 Below excerpt shows file specific thread pool,
 
     ...
     //create file specific threads
-    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 ||     //2 thread specific for reading
+    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 1) < 0 ||     //1 thread specific for reading
     hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 ||       //1 thread specific for sam write
-    hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) {       //1 thread specific for bam write
+    hts_set_opt(outfile2, HTS_OPT_NTHREADS, 2) < 0) {       //2 thread specific for bam write
         printf("Failed to set thread options\n");
         goto end;
     }
 Refer: split_thread1.c
 
-Below excerpt shows thread pool shared across files,
+Below excerpt shows a thread pool shared across files,
 
     ...
     //create a pool of 4 threads
-    if (!(tpool.pool = hts_tpool_init(4))) {
-    ...
+    if (!(tpool.pool = hts_tpool_init(4)))
+        ... // error
     //share the pool with all the 3 files
     if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 ||
     hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 ||
     hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) {
-    ...
-    if (tpool.pool) {
+        ... // error
+
+    ... // do something
+
+    //tidy up at end
+    if (tpool.pool)
         hts_tpool_destroy(tpool.pool);
-    }
     ...
 Refer: split_thread2.c
 
+Note that it is important to analyze the task in hand to decide the number of
+threads to be used. As an example, if the number of threads for reading is set
+to 2 and bam write to 1, keeping total number of threads the same, the
+performance may decrease as bam decoding is easier than encoding.
+
+Custom task / user defined functions can be performed on data using thread pool
+and for that, the task has to be scheduled to a queue. Thread pool associated
+with the queue will perform the task. There can be multiple pools and queues.
+The order of execution of threads is decided based on many factors and load on
+each task may vary, so the completion of the tasks may not be in the order of
+their queueing. The queues can be used in two different ways, one where the
+result is enqueued to queue again to be read in same order as initial queueing,
+second where the results are not enqueued and completed possibly in a different
+order than initial queueing. Explicitly created threads can also be used along
+with hts thread pool usage.
+
+hts_tpool_process_init initializes the queue / process, associates a queue with
+thread pool and reserves space for given number of tasks on queue. It takes a
+parameter indicating whether the result need to be enqueued for retrieval or
+not. If the result is enqueued, it is retrieved in the order of scheduling of
+task. Another parameter sets the maximum number of slots for tasks in queue,
+usually 2 times the number of threads are used. The input and output have their
+own queues and they grow as required up to the max set. hts_tpool_dispatch api
+enqueues the task to the queue. The api blocks when there is no space in queue.
+This behavior can be controlled with hts_tpool_dispatch2 api. The queue can be
+reset using hts_tpool_process_reset api where all tasks are discarded. The api
+hts_tpool_dispatch3 supports configuring cleanup routines which are to be run
+when reset occurs on the queue. hts_tpool_process_flush api can ensure that
+all the piled up tasks are processed, a possible case when the queueing and
+processing happen at different speeds. hts_tpool_process_shutdown api stops the
+processing of queue.
+
+There are a few apis which let the user check the status of processing. The
+api hts_tpool_process_empty shows whether all the tasks are completed or not.
+The api hts_tpool_process_sz gives the number of tasks, at different states of
+processing. The api hts_tpool_process_len gives the number of results in output
+queue waiting to be collected.
+
+The order of execution of tasks depends on the number of threads involved and
+how the threads are scheduled by operating system. When the results are enqueued
+back to queue, they are read in same order of enqueueing of task and in that
+case the order of execution will not be noticed. When the results are not
+enqueued the results are available right away and the order of execution may be
+noticeable. Based on the nature of task and the need of order maintenance, users
+can select either of the queueing.
+
+Below excerpts show the usage of queues and threads in both cases. In the 1st,
+alignments are updated with an aux tag indicating GC ratio. The order of data
+has to be maintained even after update, hence the result queueing is used to
+ensure same order as initial. A number of alignments are bunched together and
+reuse of allocated memory is made to make it perform better. A sentinel job is
+used to identify the completion of all tasks at the result collection side.
+    ...
+    void *thread_ordered_proc(void *args)
+    {
+        ...
+        for ( i = 0; i < bamdata->count; ++i) {
+            ...
+            for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos)
+                count[bam_seqi(data,pos)]++;
+            ...
+            gcratio = (count[2] /*C*/ + count[4] /*G*/) / (float) (count[1] /*A*/ + count[8] /*T*/ + count[2] + count[4]);
+
+            if (bam_aux_append(bamdata->bamarray[i], "xr", 'f', sizeof(gcratio), (const uint8_t*)&gcratio) < 0) {
+
+    ...
+    void *threadfn_orderedwrite(void *args)
+    {
+        ...
+        //get result and write; wait if no result is in queue - until shutdown of queue
+        while (tdata->result == 0 &&
+            (r = hts_tpool_next_result_wait(tdata->queue)) != NULL) {
+            bamdata = (data*) hts_tpool_result_data(r);
+            ...
+            for (i = 0; i < bamdata->count; ++i) {
+                if (sam_write1(tdata->outfile, tdata->samhdr, bamdata->bamarray[i]) < 0) {
+                    ... // error
+            ...
+            hts_tpool_delete_result(r, 0);              //release the result memory
+            ...
+
+        // Shut down the process queue.  If we stopped early due to a write failure,
+        // this will signal to the other end that something has gone wrong.
+        hts_tpool_process_shutdown(tdata->queue);
+
+    ...
+    int main(int argc, char *argv[])
+    {
+        ...
+        if (!(pool = hts_tpool_init(cnt)))                  //thread pool
+            ... // error
+        tpool.pool = pool;      //to share the pool for file read and write as well
+        //queue to use with thread pool, for task and results
+        if (!(queue = hts_tpool_process_init(pool, cnt * 2, 0))) {
+    ...
+        //share the thread pool with i/o files
+        if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 ||
+            hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool) < 0)
+            ... // error
+        if (pthread_create(&thread, NULL, threadfn_orderedwrite, &twritedata))
+            ... // error
+        while (c >= 0) {
+            if (!(bamdata = getbamstorage(chunk, &bamcache)))
+                ... // error
+            for (cnt = 0; cnt < bamdata->maxsize; ++cnt) {
+                c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]);
+                ...
+                if (hts_tpool_dispatch3(pool, queue, thread_ordered_proc, bamdata,
+                                        cleanup_bamstorage, cleanup_bamstorage,
+                                        0) == -1)
+                    ... // error
+        ...
+        if (queue) {
+            if (-1 == c) {
+                // EOF read, send a marker to tell the threadfn_orderedwrite()
+                // function to shut down.
+                if (hts_tpool_dispatch(pool, queue, thread_ordered_proc,
+                                    NULL) == -1) {
+                    ... // error
+                hts_tpool_process_shutdown(queue);
+
+        ...
+        // Wait for threadfn_orderedwrite to finish.
+        if (started_thread) {
+            pthread_join(thread, NULL);
+
+        ...
+        if (queue) {
+            // Once threadfn_orderedwrite has stopped, the queue can be
+            // cleaned up.
+            hts_tpool_process_destroy(queue);
+        }
+    ...
+Refer: qtask_ordered.c
+
+In this 2nd, the bases are counted and GC ratio of whole file is calculated.
+Order in which bases are counted is not relevant and no result queue required.
+The queue is created as input only.
+    ...
+    void *thread_unordered_proc(void *args)
+    {
+        ...
+        for ( i = 0; i < bamdata->count; ++i) {
+            data = bam_get_seq(bamdata->bamarray[i]);
+            for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos)
+                counts[bam_seqi(data, pos)]++;
+
+        ...
+        //update result and add the memory block for reuse
+        pthread_mutex_lock(&bamdata->cache->lock);
+        for (i = 0; i < 16; i++) {
+            bamdata->bases->counts[i] += counts[i];
+        }
+
+        bamdata->next = bamdata->cache->list;
+        bamdata->cache->list = bamdata;
+        pthread_mutex_unlock(&bamdata->cache->lock);
+
+    ...
+    int main(int argc, char *argv[])
+    {
+        ...
+        if (!(queue = hts_tpool_process_init(pool, cnt * 2, 1)))
+            ... // error
+        c = 0;
+        while (c >= 0) {
+            ...
+            for (cnt = 0; cnt < bamdata->maxsize; ++cnt) {
+                c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]);
+
+            ...
+            if (c >= -1 ) {
+                ...
+                if (hts_tpool_dispatch3(pool, queue, thread_unordered_proc, bamdata,
+                                        cleanup_bamstorage, cleanup_bamstorage,
+                                        0) == -1)
+                    ... // error
+        ...
+        if (-1 == c) {
+            // EOF read, ensure all are processed, waits for all to finish
+            if (hts_tpool_process_flush(queue) == -1) {
+                fprintf(stderr, "Failed to flush queue\n");
+            } else { //all done
+                //refer seq_nt16_str to find position of required bases
+                fprintf(stdout, "GCratio: %f\nBase counts:\n",
+                    (gccount.counts[2] /*C*/ + gccount.counts[4] /*G*/) / (float)
+                        (gccount.counts[1] /*A*/ + gccount.counts[8] /*T*/ +
+                            gccount.counts[2] + gccount.counts[4]));
+        ...
+        if (queue) {
+            hts_tpool_process_destroy(queue);
+        }
+Refer: qtask_unordered.c
 
 ## More Information
 
@@ -1421,9 +1724,9 @@ be destroyed as many times with sam_hdr_destroy api.
 ### Index
 
 Indices need the data to be sorted by position.  They can be of different
-types with extension .bai, .csi or .tbi for compressed SAM/BAM files and .crai
-for CRAM files.  The index name can be passed along with the alignment file
-itself by appending a specific character sequence. The apis can detect this
+types with extension .bai, .csi or .tbi for compressed SAM/BAM/VCF files and
+.crai for CRAM files.  The index name can be passed along with the alignment
+file itself by appending a specific character sequence. The apis can detect this
 sequence and extract the index path. ##idx## is the sequence which separates
 the file path and index path.
 
diff --git a/samples/Makefile b/samples/Makefile
index 40991d78f..ecbede4c5 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -2,7 +2,7 @@ HTS_DIR = ../
 include $(HTS_DIR)/htslib_static.mk
 
 CC = gcc
-CFLAGS = -Wall -g -O0
+CFLAGS = -Wall -O2
 
 #to statically link to libhts
 LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_static_LIBS)
@@ -13,91 +13,104 @@ LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_sta
 PRGS = flags split split2 cram read_fast read_header read_ref read_bam \
 	read_aux dump_aux add_header rem_header update_header mod_bam mod_aux \
 	mod_aux_ba write_fast idx_on_write read_reg read_multireg pileup \
-	mpileup modstate pileup_mod flags_field split_t1 split_t2
+	mpileup modstate pileup_mod flags_field split_t1 split_t2 \
+	read_fast_i qtask_ordered qtask_unordered index_fasta
 
 all: $(PRGS)
 
-flags:
+flags: flags_demo.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) flags_demo.c -o $@  $(LDFLAGS)
 
-split:
+split: split.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) split.c -o $@  $(LDFLAGS)
 
-split2:
+split2: split2.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) split2.c -o $@  $(LDFLAGS)
 
-cram:
+cram: cram.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) cram.c -o $@  $(LDFLAGS)
 
-read_fast:
+read_fast: read_fast.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_fast.c -o $@  $(LDFLAGS)
 
-read_header:
+read_header: read_header.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_header.c -o $@  $(LDFLAGS)
 
-read_ref:
+read_ref: read_refname.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_refname.c -o $@  $(LDFLAGS)
 
-read_bam:
+read_bam: read_bam.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_bam.c -o $@  $(LDFLAGS)
 
-read_aux:
+read_aux: read_aux.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) read_aux.c -o $@  $(LDFLAGS)
 
-dump_aux:
+dump_aux: dump_aux.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) dump_aux.c -o $@  $(LDFLAGS)
 
-add_header:
+add_header: add_header.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) add_header.c -o $@  $(LDFLAGS)
 
-rem_header:
+rem_header: rem_header.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) rem_header.c -o $@  $(LDFLAGS)
 
-update_header:
+update_header: update_header.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) update_header.c -o $@  $(LDFLAGS)
 
-mod_bam:
+mod_bam: mod_bam.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_bam.c -o $@  $(LDFLAGS)
 
-mod_aux:
+mod_aux: mod_aux.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux.c -o $@  $(LDFLAGS)
 
-mod_aux_ba:
+mod_aux_ba: mod_aux_ba.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux_ba.c -o $@  $(LDFLAGS)
 
-write_fast:
+write_fast: write_fast.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) write_fast.c -o $@  $(LDFLAGS)
 
-idx_on_write:
+idx_on_write: index_write.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) index_write.c -o $@  $(LDFLAGS)
 
-read_reg:
+read_reg: index_reg_read.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) index_reg_read.c -o $@  $(LDFLAGS)
 
-read_multireg:
+read_multireg: index_multireg_read.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) index_multireg_read.c -o $@  $(LDFLAGS)
 
-pileup:
+read_fast_i: read_fast_index.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) read_fast_index.c -o $@  $(LDFLAGS)
+
+pileup: pileup.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) pileup.c -o $@  $(LDFLAGS)
 
-mpileup:
+mpileup: mpileup.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) mpileup.c -o $@  $(LDFLAGS)
 
-modstate:
+modstate: modstate.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) modstate.c -o $@  $(LDFLAGS)
 
-pileup_mod:
+pileup_mod: pileup_mod.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) pileup_mod.c -o $@  $(LDFLAGS)
 
-flags_field:
+flags_field: flags_htsopt_field.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) flags_htsopt_field.c -o $@  $(LDFLAGS)
 
-split_t1:
+split_t1: split_thread1.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) split_thread1.c -o $@  $(LDFLAGS)
 
-split_t2:
+split_t2: split_thread2.c
 	$(CC) $(CFLAGS) -I $(HTS_DIR) split_thread2.c -o $@  $(LDFLAGS)
 
+index_fasta: index_fasta.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) index_fasta.c -o $@  $(LDFLAGS)
+
+qtask_ordered: qtask_ordered.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) qtask_ordered.c -o $@  $(LDFLAGS)
+
+qtask_unordered: qtask_unordered.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) qtask_unordered.c -o $@  $(LDFLAGS)
+
 clean:
 	find . -name "*.o" | xargs rm -rf
 	find . -name "*.dSYM" | xargs rm -rf
diff --git a/samples/README.md b/samples/README.md
index ab5481dea..6f90c0c3f 100644
--- a/samples/README.md
+++ b/samples/README.md
@@ -4,7 +4,7 @@ data, and is the core library used by [samtools][2] and [bcftools][3].
 
 A set of sample programs are available which showcases the usage of APIs in HTSlib.
 They are based on version 1.17 of HTSLib and are mainly for demonstration of API usage.
-Further optimization and error handling might be required for actual usage.
+Further optimisation and error handling might be required for actual usage.
 
 
 [1]: http://samtools.github.io/hts-specs/
@@ -61,7 +61,7 @@ indexed.
 
 [Read_fast][Read_fast]
 
-  This application showcases the fasta/fastq data read.
+  This application showcases reading fasta/fastq data without using an index.
 
 [Read_header][Read_header]
 
@@ -72,7 +72,7 @@ indexed.
 [Read_ref][Read_ref]
 
   This application showcases the read and access of header data. It shows
-  all reference names which has length equal or greather to given input.
+  all reference names whose length is greater than or equal to the given input.
 
 [Read_bam][Read_bam]
 
@@ -129,14 +129,18 @@ indexed.
 
 [Write_fast][Write_fast]
 
-  This application showcases the fasta/fastq data write. It appends a dummy
-  data to given file.
+  This application showcases the fasta/fastq data write. It appends data to
+  the given file.
 
 [Index_write][Index_write]
 
   This application showcases the creation of index along with output
   creation. Based on file type and shift, it creates bai, csi or crai files.
 
+[Index_fasta][Index_fasta]
+
+  This application showcases index creation on fasta/fastq reference data.
+
 [Read_reg][Read_reg]:
 
   This application showcases the usage of region specification in alignment
@@ -144,9 +148,14 @@ indexed.
 
 [Read_multireg][Read_multireg]:
 
-  This application showcases the usage of mulitple region specification in
+  This application showcases the usage of multiple region specification in
   alignment read.
 
+[Read_fast_index][Read_fast_index]
+
+  This application showcases the fasta/fastq data read using index. It takes a
+  region (reference name[:start-end]) and gets data from that region.
+
 [Pileup][Pileup]:
 
   This application showcases the pileup api, where all alignments covering a
@@ -181,8 +190,7 @@ indexed.
 
   This application showcases the use of threads in file handling. It saves
   the read1 and read2 as separate files in given directory, one as sam and
-  other as bam. 2 threads are used for read and 1 each dedicated for each
-  output file.
+  other as bam. 1 thread is used for read, 1 for sam write and 2 for bam write.
 
 [Split_thread2][Split_thread2]
 
@@ -191,6 +199,19 @@ indexed.
   and other as bam. A pool of 4 threads is created and shared for both read
   and write.
 
+[Qtask_ordered][Qtask_ordered]
+
+  This application showcases the use of queues and threads for custom
+  processing. Alignments in the input file are updated with their GC ratio in
+  a custom aux tag. The processing may occur in any order but the results are
+  retrieved in the same order as they were queued and then saved to disk.
+
+[Qtask_unordered][Qtask_unordered]
+
+  This application showcases the use of queues and threads for custom
+  processing. The count of bases and GC ratio are calculated and displayed.
+  The order of counting is irrelevant and hence ordered retrieval is not used.
+
 ### More Information
 
 More detailed documentation is available in the [DEMO.md][DEMO] with worked
@@ -215,8 +236,10 @@ examples per demonstration tool.
 [Mod_aux_ba]: mod_aux_ba.c
 [Write_fast]: write_fast.c
 [Index_write]: index_write.c
+[Index_fasta]: index_fasta.c
 [Read_reg]: index_reg_read.c
 [Read_multireg]: index_multireg_read.c
+[Read_fast_index]: read_fast_index.c
 [Pileup]: pileup.c
 [Mpileup]: mpileup.c
 [Modstate]: modstate.c
@@ -224,4 +247,6 @@ examples per demonstration tool.
 [Flags_field]: flags_htsopt_field.c
 [Split_thread1]: split_thread1.c
 [Split_thread2]: split_thread2.c
+[Qtask_ordered]: qtask_ordered.c
+[Qtask_unordered]: qtask_unordered.c
 [DEMO]: DEMO.md
diff --git a/samples/add_header.c b/samples/add_header.c
index d1a2fc13c..066b1d438 100644
--- a/samples/add_header.c
+++ b/samples/add_header.c
@@ -24,20 +24,20 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
 {
     fprintf(fp, "Usage: add_header infile\n\
-Adds new header lines of SQ, RG, PG and CO typs\n");
+Adds new header lines of SQ, RG, PG and CO types\n");
     return;
 }
 
diff --git a/samples/cram.c b/samples/cram.c
index 5f55e65d2..7b1342377 100644
--- a/samples/cram.c
+++ b/samples/cram.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/dump_aux.c b/samples/dump_aux.c
index 49251fe04..3caa16027 100644
--- a/samples/dump_aux.c
+++ b/samples/dump_aux.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -92,7 +92,7 @@ int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data)
         fprintf(fp, "%c", auxBType);
         for (i = 0; i < auxBcnt; ++i) {                                                     //iterate the array
             fprintf(fp, ",");
-            //calling recurssively  with index to reuse a few lines
+            //calling recursively  with index to reuse a few lines
             if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) {
                 return EXIT_FAILURE;
             }
diff --git a/samples/flags_demo.c b/samples/flags_demo.c
index e03fc6cd8..ac26be86c 100644
--- a/samples/flags_demo.c
+++ b/samples/flags_demo.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/flags_htsopt_field.c b/samples/flags_htsopt_field.c
index 4b64445e3..40a0affc4 100644
--- a/samples/flags_htsopt_field.c
+++ b/samples/flags_htsopt_field.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/index_fasta.c b/samples/index_fasta.c
new file mode 100644
index 000000000..ba0489094
--- /dev/null
+++ b/samples/index_fasta.c
@@ -0,0 +1,72 @@
+/*  index_fasta.c --  showcases the htslib api usage
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+    Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <time.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+
+/// print_usage - writes the index_fasta usage text
+/** @param fp pointer to the file / terminal to which the usage text is written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: index_fasta <file>\n\
+Indexes a fasta/fastq file and saves along with source.\n");
+    return;
+}
+
+/// main - builds the index for the fasta/fastq file named on the command line
+/** @param argc - count of arguments; exactly one file argument is expected
+ *  @param argv - pointer to array of arguments; argv[1] is the file to index
+returns EXIT_FAILURE on bad usage or indexing failure, EXIT_SUCCESS otherwise
+*/
+int main(int argc, char *argv[])
+{
+    const char *filename = NULL;             //file to index
+    int ret = EXIT_FAILURE;
+
+    if (argc != 2) {
+        print_usage(stdout);
+        goto end;
+    }
+    filename = argv[1];
+
+    // index the file; no explicit index file names are given (both are NULL)
+    if (fai_build3(filename, NULL, NULL) == -1) {
+        printf("Indexing failed with %d\n", errno);
+        goto end;
+    }
+    //this creates an .fai file. If the file is bgzipped, a .gzi file will be created along with .fai
+    ret = EXIT_SUCCESS;
+end:
+    //no resources were acquired above, so nothing needs releasing here
+    return ret;
+}
diff --git a/samples/index_multireg_read.c b/samples/index_multireg_read.c
index dbe8f15f9..7bb864990 100644
--- a/samples/index_multireg_read.c
+++ b/samples/index_multireg_read.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the print_usage
-/** @param fp pointer to the file / terminal to which print_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/index_reg_read.c b/samples/index_reg_read.c
index 346d5428f..dec684933 100644
--- a/samples/index_reg_read.c
+++ b/samples/index_reg_read.c
@@ -24,19 +24,19 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the print_usage
-/** @param fp pointer to the file / terminal to which print_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
 {
-    fprintf(fp, "Usage: readreg infile idxfile region\n\
+    fprintf(fp, "Usage: read_reg infile idxfile region\n\
 Reads alignments matching to a specific region\n\
 \\. from start of file\n\
 \\* only unmapped reads\n\
diff --git a/samples/index_write.c b/samples/index_write.c
index 8fd2bc968..9ec63d4ad 100644
--- a/samples/index_write.c
+++ b/samples/index_write.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <libgen.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/mod_aux.c b/samples/mod_aux.c
index b6e75fb0b..ae531b985 100644
--- a/samples/mod_aux.c
+++ b/samples/mod_aux.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <strings.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/mod_aux_ba.c b/samples/mod_aux_ba.c
index 8ef90ee1e..836a3d39c 100644
--- a/samples/mod_aux_ba.c
+++ b/samples/mod_aux_ba.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/mod_bam.c b/samples/mod_bam.c
index 6c56e62d1..616639610 100644
--- a/samples/mod_bam.c
+++ b/samples/mod_bam.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <strings.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/modstate.c b/samples/modstate.c
index 976391684..4d5f67635 100644
--- a/samples/modstate.c
+++ b/samples/modstate.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/mpileup.c b/samples/mpileup.c
index fe933748e..ecab70584 100644
--- a/samples/mpileup.c
+++ b/samples/mpileup.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <ctype.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/pileup.c b/samples/pileup.c
index 11e2fb02f..be7aad801 100644
--- a/samples/pileup.c
+++ b/samples/pileup.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <ctype.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/pileup_mod.c b/samples/pileup_mod.c
index 24d6cf539..81ac5a540 100644
--- a/samples/pileup_mod.c
+++ b/samples/pileup_mod.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <ctype.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/qtask_ordered.c b/samples/qtask_ordered.c
new file mode 100644
index 000000000..a76d59826
--- /dev/null
+++ b/samples/qtask_ordered.c
@@ -0,0 +1,425 @@
+/*  qtask_ordered.c --  showcases the htslib api usage
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+    Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <htslib/sam.h>
+#include <htslib/thread_pool.h>
+
+typedef struct data {
+    int count;                  //number of bamarray entries currently in use
+    int maxsize;                //number of bam1_t entries allocated in bamarray
+    bam1_t **bamarray;          //bam1_t array, so a whole chunk is queued as one task
+    struct data *next;          //next free chunk when held in the cache - to reuse earlier allocations
+} data;
+
+typedef struct datacache
+{
+    pthread_mutex_t lock;       //held while list is read or modified
+    data *list;                 //singly linked list of chunks available for reuse
+} datacache;
+
+typedef struct orderedwrite {
+    samFile *outfile;           //output file handle
+    sam_hdr_t *samhdr;          //header used to write data
+    hts_tpool_process *queue;   //queue from which results are retrieved
+    datacache *cache;           //written chunks are returned here for re-use
+    int result;                 //set by writer thread: 0 on success, -1 after a write failure
+} orderedwrite;
+
+/// print_usage - writes the qtask_ordered usage text
+/** @param fp pointer to the file / terminal to which the usage text is written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: qtask_ordered infile threadcount outdir [chunksize]\n\
+Calculates GC ratio - sum(G,C) / sum(A,T,C,G) - and adds to each alignment\n\
+as xr:f aux tag. Output is saved in outdir.\n\
+chunksize [4096] sets the number of alignments clubbed together to process.\n");
+    return;
+}
+
+/// getbamstorage - provides a chunk of alignment storage to fill and queue
+/** @param chunk number of bam1_t entries to allocate when nothing is cached
+ * @param bamcache cache of previously allocated chunks, protected by its lock
+returns a cached chunk if one is available, otherwise a newly allocated one; NULL on failure
+*/
+data* getbamstorage(int chunk, datacache *bamcache)
+{
+    int i = 0;
+    data *bamdata = NULL;
+
+    if (!bamcache) {
+        return NULL;
+    }
+    //the cache list is shared with the writer thread, so take the lock first
+    if (pthread_mutex_lock(&bamcache->lock)) {
+        return NULL;
+    }
+    if (bamcache->list) {                   //available
+        bamdata = bamcache->list;
+        bamcache->list = bamdata->next;     //remove and set next one as available
+        bamdata->next = NULL;               //remove link
+        bamdata->count = 0;                 //mark as empty; maxsize stays as allocated
+        goto end;
+    }
+    //nothing cached: allocate a new chunk
+    if (!(bamdata = malloc(sizeof(data)))) {
+        goto end;
+    }
+    bamdata->bamarray = malloc(chunk * sizeof(bam1_t*));
+    if (!bamdata->bamarray) {
+        free(bamdata);
+        bamdata = NULL;
+        goto end;
+    }
+    for (i = 0; i < chunk; ++i) {
+        bamdata->bamarray[i] = bam_init1();     //NOTE(review): result is not checked for NULL
+    }
+    bamdata->maxsize = chunk;
+    bamdata->count = 0;
+    bamdata->next = NULL;
+
+end:
+    pthread_mutex_unlock(&bamcache->lock);
+    return bamdata;
+}
+
+/// cleanup_bamstorage - frees a bamdata struct plus contents
+/** @param arg Pointer to the data chunk to free; NULL is accepted and ignored
+    @p arg has type void * so it can be used as a callback passed
+    to hts_tpool_dispatch3().
+ */
+void cleanup_bamstorage(void *arg)
+{
+    data *bamdata = (data *) arg;
+    if (!bamdata)
+        return;
+    if (bamdata->bamarray) {
+        int i;
+        for (i = 0; i < bamdata->maxsize; i++) {    //all allocated entries, not just those in use
+            bam_destroy1(bamdata->bamarray[i]);
+        }
+        free(bamdata->bamarray);
+    }
+    free(bamdata);
+}
+
+/// thread_ordered_proc - processes a queued task and hands the result back
+/** @param args pointer to the chunk of alignments (data) to be processed
+returns the same chunk, with an xr aux tag holding the GC ratio appended to each alignment
+the processing could be in any order based on the number of threads in use but read of output
+from queue will be in order
+a null data indicates the end of input and a null is returned to be added back to result queue
+*/
+void *thread_ordered_proc(void *args)
+{
+    int i = 0, pos = 0;
+    data *bamdata = (data*)args;
+    float gcratio = 0;
+    uint8_t *data = NULL;
+
+    if (bamdata == NULL)
+        return NULL; // Indicates no more input
+
+    for ( i = 0; i < bamdata->count; ++i) {
+        //tally the bases of this alignment, indexed by the value bam_seqi gives
+        uint64_t count[16] = {0};
+        data = bam_get_seq(bamdata->bamarray[i]);
+        for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos) {
+            count[bam_seqi(data,pos)]++;
+        }
+        /*it is faster to count all and use offset to get required counts rather than select
+        required ones inside the loop. NOTE(review): divisor is 0 if no A/C/G/T was counted*/
+        gcratio = (count[2] /*C*/ + count[4] /*G*/) / (float) (count[1] /*A*/ + count[8] /*T*/ + count[2] + count[4]);
+
+        if (bam_aux_append(bamdata->bamarray[i], "xr", 'f', sizeof(gcratio), (const uint8_t*)&gcratio) < 0) {
+            fprintf(stderr, "Failed to add aux tag xr, errno: %d\n", errno);
+            break;      //remaining alignments of this chunk are left untagged
+        }
+    }
+    return bamdata;
+}
+
+/// threadfn_orderedwrite - thread that reads the results from queue and writes them
+/** @param args pointer to the orderedwrite data for this thread; its result field is set here
+returns NULL
+*/
+void *threadfn_orderedwrite(void *args)
+{
+    orderedwrite *tdata = (orderedwrite*)args;
+    hts_tpool_result *r = NULL;
+    data *bamdata = NULL;
+    int i = 0;
+
+    tdata->result = 0;
+
+    //get result and write; wait if no result is in queue - until shutdown of queue
+    while (tdata->result == 0 &&
+           (r = hts_tpool_next_result_wait(tdata->queue)) != NULL) {
+        bamdata = (data*) hts_tpool_result_data(r);
+
+        if (bamdata == NULL) {
+            // Indicator for no more input. Time to stop.
+            hts_tpool_delete_result(r, 0);
+            break;
+        }
+
+        for (i = 0; i < bamdata->count; ++i) {
+            if (sam_write1(tdata->outfile, tdata->samhdr, bamdata->bamarray[i]) < 0) {
+                fprintf(stderr, "Failed to write output data\n");
+                tdata->result = -1;     //ends the outer loop after this chunk is recycled
+                break;
+            }
+        }
+        hts_tpool_delete_result(r, 0);              //release the result memory
+
+        pthread_mutex_lock(&tdata->cache->lock);    //chunk goes back to the cache even after a failed write
+        bamdata->next = tdata->cache->list;         //make current list as next
+        tdata->cache->list = bamdata;               //set as current to reuse
+        pthread_mutex_unlock(&tdata->cache->lock);
+    }
+
+    // Shut down the process queue.  If we stopped early due to a write failure,
+    // this will signal to the other end that something has gone wrong.
+    hts_tpool_process_shutdown(tdata->queue);
+
+    return NULL;
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *outdir = NULL;
+    char *file = NULL;
+    int c = 0, ret = EXIT_FAILURE, cnt = 0, started_thread = 0, chunk = 0;
+    size_t size = 0;
+    samFile *infile = NULL, *outfile = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+    pthread_t thread;
+    orderedwrite twritedata = {0};
+    hts_tpool *pool = NULL;
+    hts_tpool_process *queue = NULL;
+    htsThreadPool tpool = {NULL, 0};
+    data *bamdata = NULL;
+    datacache bamcache = {PTHREAD_MUTEX_INITIALIZER, NULL};
+
+    //qtask infile threadcount outdir [chunksize]
+    if (argc != 4 && argc != 5) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    cnt = atoi(argv[2]);
+    outdir = argv[3];
+    if (argc == 5) {    //chunk size present
+        chunk = atoi(argv[4]);
+    }
+    if (cnt < 1) {      //set proper thread count
+        cnt = 1;
+    }
+    if (chunk < 1) {    //set valid  chunk size
+        chunk = 4096;
+    }
+
+    //allocate space for output
+    size = (strlen(outdir) + sizeof("/out.bam") + 1);   //space for output file name and null termination
+    if (!(file = malloc(size))) {
+        fprintf(stderr, "Failed to set output path\n");
+        goto end;
+    }
+    snprintf(file, size, "%s/out.bam", outdir);         //output file name
+    if (!(pool = hts_tpool_init(cnt))) {                //thread pool
+        fprintf(stderr, "Failed to create thread pool\n");
+        goto end;
+    }
+    tpool.pool = pool;      //to share the pool for file read and write as well
+    //queue to use with thread pool, for task and results
+    if (!(queue = hts_tpool_process_init(pool, cnt * 2, 0))) {
+        fprintf(stderr, "Failed to create queue\n");
+        goto end;
+    }
+    //open input file - r reading
+    if (!(infile = sam_open(inname, "r"))) {
+        fprintf(stderr, "Could not open %s\n", inname);
+        goto end;
+    }
+    //open output files - w write as SAM, wb  write as BAM
+    if (!(outfile = sam_open(file, "wb"))) {
+        fprintf(stderr, "Could not open output file\n");
+        goto end;
+    }
+    //share the thread pool with i/o files
+    if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 ||
+          hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool) < 0) {
+        fprintf(stderr, "Failed to set threads to i/o files\n");
+        goto end;
+    }
+    //read header, required to resolve the target names to proper ids
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        fprintf(stderr, "Failed to read header from file!\n");
+        goto end;
+    }
+    //write header
+    if ((sam_hdr_write(outfile, in_samhdr) == -1)) {
+        fprintf(stderr, "Failed to write header\n");
+        goto end;
+    }
+
+    /* tasks are queued, worker threads get them and process in parallel;
+    the results are queued and they are to be removed in parallel as well */
+
+    // start output writer thread for ordered processing
+    twritedata.outfile = outfile;
+    twritedata.samhdr  = in_samhdr;
+    twritedata.result  = 0;
+    twritedata.queue   = queue;
+    twritedata.cache   = &bamcache;
+    if (pthread_create(&thread, NULL, threadfn_orderedwrite, &twritedata)) {
+        fprintf(stderr, "Failed to create writer thread\n");
+        goto end;
+    }
+    started_thread = 1;
+
+    c = 0;
+    while (c >= 0) {
+        if (!(bamdata = getbamstorage(chunk, &bamcache))) {
+            fprintf(stderr, "Failed to allocate memory\n");
+            break;
+        }
+        //read alignments, upto max size for this lot
+        for (cnt = 0; cnt < bamdata->maxsize; ++cnt) {
+            c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]);
+            if (c < 0) {
+                break;      // EOF or failure
+            }
+        }
+        if (c >= -1 ) {
+            //max size data or reached EOF
+            bamdata->count = cnt;
+            // Queue the data for processing.  hts_tpool_dispatch3() is
+            // used here as it allows in-flight data to be cleaned up
+            // properly when stopping early due to errors.
+            if (hts_tpool_dispatch3(pool, queue, thread_ordered_proc, bamdata,
+                                    cleanup_bamstorage, cleanup_bamstorage,
+                                    0) == -1) {
+                fprintf(stderr, "Failed to schedule processing\n");
+                goto end;
+            }
+            bamdata = NULL;
+        } else {
+            fprintf(stderr, "Error in reading data\n");
+            break;
+        }
+    }
+    // Only a clean EOF (c == -1) is a success; allocation / read failures break out with c != -1
+    ret = (c == -1) ? EXIT_SUCCESS : EXIT_FAILURE;
+
+ end:
+    // Tidy up after having dispatched all of the data.
+
+    // Note that the order here is important.  In particular, we need
+    // to join the thread that was started earlier before freeing anything
+    // to avoid any use-after-free errors.
+
+    // It's also possible to get here early due to various error conditions,
+    // so we need to carefully check which parts of the program state have
+    // been created before trying to clean them up.
+
+    if (queue) {
+        if (-1 == c) {
+            // EOF read, send a marker to tell the threadfn_orderedwrite()
+            // function to shut down.
+            if (hts_tpool_dispatch(pool, queue, thread_ordered_proc,
+                                   NULL) == -1) {
+                fprintf(stderr, "Failed to schedule sentinel job\n");
+                ret = EXIT_FAILURE;
+            }
+        } else {
+            // Error or we never wrote anything.  Shut down the queue to
+            // ensure threadfn_orderedwrite() wakes up and terminates.
+            hts_tpool_process_shutdown(queue);
+        }
+    }
+
+    // Wait for threadfn_orderedwrite to finish.
+    if (started_thread) {
+        pthread_join(thread, NULL);
+
+        // Once the writer thread has finished, check the result it sent back
+        if (twritedata.result != 0) {
+            ret = EXIT_FAILURE;
+        }
+    }
+
+    if (queue) {
+        // Once threadfn_orderedwrite has stopped, the queue can be
+        // cleaned up.
+        hts_tpool_process_destroy(queue);
+    }
+
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        if (sam_close(infile) != 0) {
+            ret = EXIT_FAILURE;
+        }
+    }
+    if (outfile) {
+        if (sam_close(outfile) != 0) {
+            ret = EXIT_FAILURE;
+        }
+    }
+    if (bamdata) { cleanup_bamstorage(bamdata); }   // chunk never handed to the queue (read / dispatch failure)
+    pthread_mutex_lock(&bamcache.lock);
+    if (bamcache.list) {
+        struct data *tmp = NULL;
+        while (bamcache.list) {
+            tmp = bamcache.list;
+            bamcache.list = bamcache.list->next;
+            cleanup_bamstorage(tmp);
+        }
+    }
+    pthread_mutex_unlock(&bamcache.lock);
+
+    if (file) {
+        free(file);
+    }
+    if (pool) {
+        hts_tpool_destroy(pool);
+    }
+    return ret;
+}
diff --git a/samples/qtask_unordered.c b/samples/qtask_unordered.c
new file mode 100644
index 000000000..05fe50346
--- /dev/null
+++ b/samples/qtask_unordered.c
@@ -0,0 +1,320 @@
+/*  qtask_unordered.c --  showcases the htslib api usage
+
+    Copyright (C) 2024 Genome Research Ltd.
+
+    Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <htslib/sam.h>
+#include <htslib/thread_pool.h>
+
+struct datacache;               //forward declaration, struct data points back to its cache
+
+typedef struct basecount {
+    uint64_t counts[16];        //count of all bases, indexed by the bam_seqi() value of each base
+} basecount;
+
+typedef struct data {
+    int count;                  //used up size
+    int maxsize;                //max size per data chunk
+    bam1_t **bamarray;          //bam1_t array for optimal queueing
+
+    struct datacache *cache;    //cache this chunk is returned to once processed
+    basecount *bases;           //count of all possible bases
+    struct data *next;          //pointer to next one - to reuse earlier allocations
+} data;
+
+typedef struct datacache
+{
+    pthread_mutex_t lock;       //synchronizes the access to cache
+    data *list;                 //data storage, singly linked through data::next
+} datacache;
+
+/// print_usage - print the usage text of the qtask_unordered sample
+/** @param fp stream (file / terminal) to which the usage text is written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: qtask_unordered infile threadcount [chunksize]\n\
+Shows the base counts and calculates GC ratio - sum(G,C) / sum(A,T,C,G)\n\
+chunksize [4096] sets the number of alignments clubbed together to process.\n");
+    return;
+}
+
+/// getbamstorage - gets storage for a chunk of alignments to queue
+/** @param chunk number of bam data to allocate
+ * @param bases storage of result
+ * @param bamcache cached storage
+returns cached storage if one is available, otherwise newly allocated storage; NULL on failure
+*/
+data* getbamstorage(int chunk, basecount *bases, datacache *bamcache)
+{
+    int i = 0;
+    data *bamdata = NULL;
+
+    if (!bamcache || !bases) {
+        return NULL;
+    }
+    //get from cache if there is an already allocated storage
+    if (pthread_mutex_lock(&bamcache->lock)) {
+        return NULL;
+    }
+    if (bamcache->list) {                   //available
+        bamdata = bamcache->list;
+        bamcache->list = bamdata->next;     //remove and set next one as available
+        bamdata->next = NULL;               //remove link
+        bamdata->count = 0;                 //maxsize is kept as first allocated, chunk is not used here
+
+        bamdata->bases = bases;
+        bamdata->cache = bamcache;
+        goto end;
+    }
+    //nothing cached - allocate and use
+    if (!(bamdata = malloc(sizeof(data)))) {
+        goto end;
+    }
+    bamdata->bamarray = malloc(chunk * sizeof(bam1_t*));
+    if (!bamdata->bamarray) {
+        free(bamdata);
+        bamdata = NULL;
+        goto end;
+    }
+    for (i = 0; i < chunk; ++i) {
+        bamdata->bamarray[i] = bam_init1();     //NOTE(review): result is not checked here - confirm callers cope with a NULL entry
+    }
+    bamdata->maxsize = chunk;
+    bamdata->count = 0;
+    bamdata->next = NULL;
+
+    bamdata->bases = bases;
+    bamdata->cache = bamcache;
+
+end:
+    pthread_mutex_unlock(&bamcache->lock);  //every path that took the lock leaves through here
+    return bamdata;
+}
+
+/// cleanup_bamstorage - frees a bamdata struct plus contents
+/** @param arg Pointer to data to free, NULL is accepted and ignored
+    @p arg has type void * so it can be used as a callback passed
+    to hts_tpool_dispatch3().
+ */
+void cleanup_bamstorage(void *arg)
+{
+    data *bamdata = (data *) arg;
+    if (!bamdata)
+        return;
+    if (bamdata->bamarray) {
+        int i;
+        for (i = 0; i < bamdata->maxsize; i++) {    //all maxsize entries, not just the count in use
+            bam_destroy1(bamdata->bamarray[i]);
+        }
+        free(bamdata->bamarray);
+    }
+    free(bamdata);
+}
+
+/// thread_unordered_proc - does the processing of task in queue and updates result
+/** @param args pointer to set of data to be processed
+returns NULL
+the processing could be in any order based on the number of threads in use
+*/
+void *thread_unordered_proc(void *args)
+{
+    int i = 0;
+    data *bamdata = (data*)args;
+    uint64_t pos = 0;
+    uint8_t *data = NULL;
+    uint64_t counts[16] = {0};      //local tally, merged into the shared result under the lock below
+    for ( i = 0; i < bamdata->count; ++i) {
+        data = bam_get_seq(bamdata->bamarray[i]);
+        for (pos = 0; pos < bamdata->bamarray[i]->core.l_qseq; ++pos) {
+            /* it is faster to count all bases and select required ones later
+            compared to select and count here */
+            counts[bam_seqi(data, pos)]++;
+        }
+    }
+    //update result and add the memory block for reuse, both under the cache lock
+    pthread_mutex_lock(&bamdata->cache->lock);
+    for (i = 0; i < 16; i++) {
+        bamdata->bases->counts[i] += counts[i];
+    }
+
+    bamdata->next = bamdata->cache->list;       //push this chunk on the front of the cache list
+    bamdata->cache->list = bamdata;
+    pthread_mutex_unlock(&bamdata->cache->lock);
+
+    return NULL;
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;
+    int c = 0, ret = EXIT_FAILURE, cnt = 0, chunk = 0;
+    samFile *infile = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+    hts_tpool *pool = NULL;
+    hts_tpool_process *queue = NULL;
+    htsThreadPool tpool = {NULL, 0};
+    data *bamdata = NULL;
+    basecount gccount = {{0}};
+    datacache bamcache = {PTHREAD_MUTEX_INITIALIZER, NULL};
+
+    //qtask infile threadcount [chunksize]
+    if (argc != 3 && argc != 4) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    cnt = atoi(argv[2]);
+    if (argc == 4) {
+        chunk = atoi(argv[3]);
+    }
+    if (cnt < 1) {
+        cnt = 1;
+    }
+    if (chunk < 1) {
+        chunk = 4096;
+    }
+
+    if (!(pool = hts_tpool_init(cnt))) {
+        fprintf(stderr, "Failed to create thread pool\n");
+        goto end;
+    }
+    tpool.pool = pool;      //to share the pool for file read and write as well
+    //queue to use with thread pool, for tasks
+    if (!(queue = hts_tpool_process_init(pool, cnt * 2, 1))) {
+        fprintf(stderr, "Failed to create queue\n");
+        goto end;
+    }
+    //open input file - r reading
+    if (!(infile = sam_open(inname, "r"))) {
+        fprintf(stderr, "Could not open %s\n", inname);
+        goto end;
+    }
+    //share the thread pool with i/o files
+    if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0) {
+        fprintf(stderr, "Failed to set threads to i/o files\n");
+        goto end;
+    }
+    //read header, required to resolve the target names to proper ids
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        fprintf(stderr, "Failed to read header from file!\n");
+        goto end;
+    }
+
+    /*tasks are queued, worker threads get them and process in parallel;
+    all bases are counted instead of counting atcg alone as it is faster*/
+
+    c = 0;
+    while (c >= 0) {
+        //use cached storage to avoid allocate/deallocate overheads
+        if (!(bamdata = getbamstorage(chunk, &gccount, &bamcache))) {
+            fprintf(stderr, "Failed to allocate memory\n");
+            break;
+        }
+        //read alignments, upto max size for this lot
+        for (cnt = 0; cnt < bamdata->maxsize; ++cnt) {
+            c = sam_read1(infile, in_samhdr, bamdata->bamarray[cnt]);
+            if (c < 0) {
+                break;      // EOF or failure
+            }
+        }
+        if (c >= -1 ) {
+            //max size data or reached EOF
+            bamdata->count = cnt;
+            // Queue the data for processing.  hts_tpool_dispatch3() is
+            // used here as it allows in-flight data to be cleaned up
+            // properly when stopping early due to errors.
+            if (hts_tpool_dispatch3(pool, queue, thread_unordered_proc, bamdata,
+                                    cleanup_bamstorage, cleanup_bamstorage,
+                                    0) == -1) {
+                fprintf(stderr, "Failed to schedule processing\n");
+                goto end;
+            }
+            bamdata = NULL;     //ownership passed to the queue
+        } else {
+            fprintf(stderr, "Error in reading data\n");
+            break;
+        }
+    }
+
+    if (-1 == c) {
+        // EOF read, ensure all are processed, waits for all to finish
+        if (hts_tpool_process_flush(queue) == -1) {
+            fprintf(stderr, "Failed to flush queue\n");
+        } else { //all done
+            //refer seq_nt16_str to find position of required bases
+            fprintf(stdout, "GCratio: %f\nBase counts:\n",
+                (gccount.counts[2] /*C*/ + gccount.counts[4] /*G*/) / (float)
+                    (gccount.counts[1] /*A*/ + gccount.counts[8] /*T*/ +
+                        gccount.counts[2] + gccount.counts[4]));
+
+            for (cnt = 0; cnt < 16; ++cnt) {
+                fprintf(stdout, "%c: %"PRIu64"\n", seq_nt16_str[cnt], gccount.counts[cnt]);
+            }
+
+            ret = EXIT_SUCCESS;
+        }
+    }
+ end:
+    if (queue) {
+        hts_tpool_process_destroy(queue);
+    }
+
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        if (sam_close(infile) != 0) {
+            ret = EXIT_FAILURE;
+        }
+    }
+    cleanup_bamstorage(bamdata);    //chunk never handed to the queue (read / dispatch failure), NULL is ignored
+    pthread_mutex_lock(&bamcache.lock);
+    if (bamcache.list) {
+        struct data *tmp = NULL;
+        while (bamcache.list) {
+            tmp = bamcache.list;
+            bamcache.list = bamcache.list->next;
+            cleanup_bamstorage(tmp);
+        }
+    }
+    pthread_mutex_unlock(&bamcache.lock);
+
+    if (pool) {
+        hts_tpool_destroy(pool);
+    }
+    return ret;
+}
diff --git a/samples/read_aux.c b/samples/read_aux.c
index cbf972b98..efd6f3651 100644
--- a/samples/read_aux.c
+++ b/samples/read_aux.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -92,7 +92,7 @@ int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data)
         fprintf(fp, "%c", auxBType);
         for (i = 0; i < auxBcnt; ++i) {                                                     //iterate the array
             fprintf(fp, ",");
-            //calling recurssively  with index to reuse a few lines
+            //calling recursively  with index to reuse a few lines
             if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) {
                 return EXIT_FAILURE;
             }
@@ -166,7 +166,7 @@ int main(int argc, char *argv[])
         else {
             //option 2 - get raw data
             if (!(data = bam_aux_get(bamdata, tag))) {
-                //tag data not returned, errono gives the reason
+                //tag data not returned, errno gives the reason
                 if (errno == ENOENT) {
                     printf("Tag not present\n");
                 }
diff --git a/samples/read_bam.c b/samples/read_bam.c
index 7fca8c55d..30bedf81c 100644
--- a/samples/read_bam.c
+++ b/samples/read_bam.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/read_fast.c b/samples/read_fast.c
index f74b25515..10f807b69 100644
--- a/samples/read_fast.c
+++ b/samples/read_fast.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -83,6 +83,8 @@ int main(int argc, char *argv[])
 
     //read data
     while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        printf("\nname: ");
+        printf("%s", bam_get_qname(bamdata));
         printf("\nsequence: ");
         for (c = 0; c < bamdata->core.l_qseq; ++c) {
             printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]);
@@ -90,10 +92,11 @@ int main(int argc, char *argv[])
         if (infile->format.format == fastq_format) {
             printf("\nquality: ");
             for (c = 0; c < bamdata->core.l_qseq; ++c) {
-                printf("%c", bam_get_qual(bamdata)[c]);
+                printf("%c", bam_get_qual(bamdata)[c] + 33);
             }
         }
     }
+    printf("\n");
     if (c != -1) {
         //error
         printf("Failed to get data\n");
diff --git a/samples/read_fast_index.c b/samples/read_fast_index.c
new file mode 100644
index 000000000..97076630a
--- /dev/null
+++ b/samples/read_fast_index.c
@@ -0,0 +1,163 @@
+/*  read_fast_index.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+
+/// print_usage - show usage of the indexed fasta/fastq read sample
+/** @param fp stream (file / terminal) to which the usage text is written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: read_fast_i <infile> A/Q 0/1 regiondef\n\
+Reads the fasta/fastq file using index and shows the content.\n\
+For fasta files use A and Q for fastq files.\n\
+Region can be 1 or more of <reference name>[:start-end] entries separated by comma.\n\
+For single region, give regcount as 0 and non 0 for multi-regions.\n");
+    return;
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *region = NULL, *data = NULL, *remaining = NULL;
+    int ret = EXIT_FAILURE, tid = -1, usemulti = 0;
+    faidx_t *idx = NULL;
+    enum fai_format_options fmt = FAI_FASTA;
+    hts_pos_t len = 0, beg = 0, end = 0;
+
+    //read_fast_i infile A/Q regcount region
+    if (argc != 5) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    if (argv[2][0] == 'Q') {        //anything other than Q keeps the FAI_FASTA default
+        fmt = FAI_FASTQ;
+    }
+    usemulti = atoi(argv[3]);       //0 - single region api, non 0 - region list api
+    region = argv[4];
+
+    //load index
+    if (!(idx = fai_load3_format(inname, NULL, NULL, FAI_CREATE, fmt))) {
+        printf("Failed to load index\n");
+        goto end;
+    }
+
+    if (!usemulti) {
+        //get data from given region
+        if (!(data = fai_fetch64(idx, region, &len))) {
+            if (-1 == len) {
+                printf("Failed to get data\n");                 //failure
+                goto end;
+            }
+            else {
+                printf("Data not found for given region\n");    //no data
+            }
+        }
+        else {
+            printf("Data: %"PRId64" %s\n", len, data);
+            free((void*)data);      //data is freed by the caller, cast drops the const
+            //get quality for fastq type
+            if (fmt == FAI_FASTQ) {
+                if (!(data = fai_fetchqual64(idx, region, &len))) {
+                    if (len == -1) {
+                        printf("Failed to get data\n");
+                        goto end;
+                    }
+                    else {
+                        printf("Data not found for given region\n");
+                    }
+                }
+                else {
+                    printf("Qual: %"PRId64" %s\n", len, data);
+                    free((void*)data);
+                }
+            }
+        }
+    }
+    else {
+        //parse, get each region and get data for each; loop ends when fai_parse_region returns NULL (NOTE(review): presumably also on a parse error - confirm)
+        while ((remaining = fai_parse_region(idx, region, &tid, &beg, &end, HTS_PARSE_LIST))) {     //here expects regions as csv
+            //parsed the region, correct end points based on actual data
+            if (fai_adjust_region(idx, tid, &beg, &end) == -1) {
+                printf("Error in adjusting region for tid %d\n", tid);
+                goto end;
+            }
+            //get data for given region
+            if (!(data = faidx_fetch_seq64(idx, faidx_iseq(idx, tid), beg, end, &len))) {
+                if (len == -1) {
+                    printf("Failed to get data\n");                 //failure
+                    goto end;
+                }
+                else {
+                    printf("No data found for given region\n");     //no data
+                }
+            }
+            else {
+                printf("Data: %"PRIhts_pos" %s\n", len, data);
+                free((void*)data);
+                data = NULL;
+
+                //get quality data for fastq
+                if (fmt == FAI_FASTQ) {
+                    if (!(data = faidx_fetch_qual64(idx, faidx_iseq(idx, tid), beg, end, &len))) {
+                        if (len == -1) {
+                            printf("Failed to get qual data\n");
+                            goto end;
+                        }
+                        else {
+                            printf("No data found for given region\n");
+                        }
+                    }
+                    else {
+                        printf("Qual: %"PRIhts_pos" %s\n", len, data);
+                        free((void*)data);
+                        data = NULL;
+                    }
+                }
+            }
+            region = remaining;                                     //parse remaining region defs
+        }
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //clean up
+    if (idx) {
+        fai_destroy(idx);
+    }
+    return ret;
+}
diff --git a/samples/read_header.c b/samples/read_header.c
index eb14daea5..54b07e736 100644
--- a/samples/read_header.c
+++ b/samples/read_header.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/read_refname.c b/samples/read_refname.c
index adbc71183..9b4918ded 100644
--- a/samples/read_refname.c
+++ b/samples/read_refname.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/rem_header.c b/samples/rem_header.c
index a0b6510fb..852d5f055 100644
--- a/samples/rem_header.c
+++ b/samples/rem_header.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -124,7 +124,7 @@ int main(int argc, char *argv[])
     ret = EXIT_SUCCESS;
     //bam data write to follow....
 end:
-    //cleanupq
+    //cleanup
     if (in_samhdr) {
         sam_hdr_destroy(in_samhdr);
     }
diff --git a/samples/sample.bed b/samples/sample.bed
new file mode 100644
index 000000000..2ae458fd5
--- /dev/null
+++ b/samples/sample.bed
@@ -0,0 +1,4 @@
+T1	1	2
+T1	30	35
+T2	10	15
+T2	30	40
diff --git a/samples/sample.ref.fq b/samples/sample.ref.fq
new file mode 100644
index 000000000..18b2b9617
--- /dev/null
+++ b/samples/sample.ref.fq
@@ -0,0 +1,16 @@
+@T1
+AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT
++
+AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT
+@T2
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT
++
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT
+@T3
+TTTTGGGGACTGTTAACAGT
++
+TTTTGGGGACTGTTAACAGT
+@T4
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTGGGGACTGTTAACAGT
++
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTTGGGGACTGTTAACAGT
diff --git a/samples/sample.sam b/samples/sample.sam
index e56efd69f..58515c976 100644
--- a/samples/sample.sam
+++ b/samples/sample.sam
@@ -9,7 +9,7 @@
 @CO	1234567890123456789012345678901234567890
 @CO	AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT	T1
 @CO	TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT	T2
-@CO	ITR1-ITR2M, ITR2-ITR2M are proper pairs in T1 and T2, UNMP1 is partly mapped and pair is unmapped, UNMP2 & 3 are unmappped
+@CO	ITR1-ITR2M, ITR2-ITR2M are proper pairs in T1 and T2, UNMP1 is partly mapped and pair is unmapped, UNMP2 & 3 are unmapped
 @CO	A1-A2, A4-A3 are proper pairs with A4-A3 in different read order. A5 is secondary alignment
 ITR1	99	T1	5	40	4M	=	33	10	ACTG	()()
 ITR2	147	T2	23	49	2M	=	35	-10	TT	**
diff --git a/samples/split.c b/samples/split.c
index 2eb9e6b79..c51dbd385 100644
--- a/samples/split.c
+++ b/samples/split.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/split2.c b/samples/split2.c
index 2354abfe3..33fabbd67 100644
--- a/samples/split2.c
+++ b/samples/split2.c
@@ -24,19 +24,19 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
 {
-    fprintf(fp, "Usage: split infile outdir\n\
+    fprintf(fp, "Usage: split2 infile outdir\n\
 Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\
 Shows file type selection through name and format api\n");
     return;
diff --git a/samples/split_thread1.c b/samples/split_thread1.c
index 40d2dfdc2..551c7f093 100644
--- a/samples/split_thread1.c
+++ b/samples/split_thread1.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
@@ -94,9 +94,9 @@ int main(int argc, char *argv[])
     }
 
     //create file specific threads
-    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 ||     //2 thread specific for reading
+    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 1) < 0 ||     //1 thread specific for reading
     hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 ||       //1 thread specific for sam write
-    hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) {       //1 thread specific for bam write
+    hts_set_opt(outfile2, HTS_OPT_NTHREADS, 2) < 0) {       //2 thread specific for bam write
         printf("Failed to set thread options\n");
         goto end;
     }
diff --git a/samples/split_thread2.c b/samples/split_thread2.c
index dab897b5f..dc8bc9f31 100644
--- a/samples/split_thread2.c
+++ b/samples/split_thread2.c
@@ -24,15 +24,15 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 #include <htslib/thread_pool.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/update_header.c b/samples/update_header.c
index f6b1680cd..237d5c4df 100644
--- a/samples/update_header.c
+++ b/samples/update_header.c
@@ -24,14 +24,14 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
 #include <htslib/sam.h>
 
-/// print_usage - print the demo_usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - print the usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
diff --git a/samples/write_fast.c b/samples/write_fast.c
index ef7817683..626c693f6 100644
--- a/samples/write_fast.c
+++ b/samples/write_fast.c
@@ -24,19 +24,21 @@ DEALINGS IN THE SOFTWARE
 
 */
 
-/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */
+/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimisation */
 
 #include <getopt.h>
 #include <unistd.h>
+#include <time.h>
 #include <htslib/sam.h>
+#include <htslib/faidx.h>
 
-/// print_usage - show flags_demo usage
-/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+/// print_usage - show usage
+/** @param fp pointer to the file / terminal to which usage to be dumped
 returns nothing
 */
 static void print_usage(FILE *fp)
 {
-    fprintf(fp, "Usage: write_fast <file>\n\
+    fprintf(fp, "Usage: write_fast <file> <sequence> [<qualities]\n\
 Appends a fasta/fastq file.\n");
     return;
 }
@@ -54,12 +56,22 @@ int main(int argc, char *argv[])
     sam_hdr_t *out_samhdr = NULL;           //header of file
     bam1_t *bamdata = NULL;                 //to hold the read data
     char mode[4] = "a";
+    const char *data = NULL, *qual = NULL;  //ref data and quality
+    char name[256] = {0};
 
-    if (argc != 2) {
+    if (argc > 4 || argc < 3) {
         print_usage(stdout);
         goto end;
     }
     outname = argv[1];
+    data = argv[2];
+    if (argc == 4) {    //fastq data
+        qual = argv[3];
+        if (strlen(data) != strlen(qual)) {     //check for proper length of data and quality values
+            printf("Incorrect reference and quality data\n");
+            goto end;
+        }
+    }
 
     //initialize
     if (!(bamdata = bam_init1())) {
@@ -71,12 +83,19 @@ int main(int argc, char *argv[])
         goto end;
     }
     //open output file
-    if (!(outfile = sam_open(outname, mode))) {
+    if (!(outfile = sam_open(outname, mode))) {         //expects the name to have correct extension!
         printf("Could not open %s\n", outname);
         goto end;
     }
-    //dummy data
-    if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) < 0) {
+    /* if the file name extension is not appropriate to the content, inconsistent data will be present in output.
+    if required, htsFormat and sam_open_format can be explicitly used to ensure appropriateness of content.
+    htsFormat fmt = {sequence_data, fastq_format / fasta_format};
+    sam_open_format(outname, mode, fmt);
+    */
+
+    snprintf(name, sizeof(name), "Test_%ld", time(NULL));
+    //data
+    if (bam_set1(bamdata, strlen(name), name, BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, strlen(data), data, qual, 0) < 0) {
         printf("Failed to set data\n");
         goto end;
     }
@@ -84,7 +103,6 @@ int main(int argc, char *argv[])
         printf("Failed to write data\n");
         goto end;
     }
-
     ret = EXIT_SUCCESS;
 end:
     //clean up

From b098572773a6a87f19d38ce35759e93ad7c6e0d2 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 16 Jul 2024 12:25:59 +0100
Subject: [PATCH 50/76] Adjust GitHub actions windows build package list

Use the mingw-w64-x86_64 flavour of all packages.  Add curl, so
hfile_libcurl etc. build.  Remove the mingw git package as git
for windows is already present.  Add the location of git for
windows to $PATH (note this must be at the end, else the wrong
shell gets picked up when running tests leading to backslash
conversion issues).

Switch the badge on README.md from AppVeyor to GitHub.
---
 .github/workflows/windows-build.yml | 16 ++++++++--------
 README.md                           |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml
index c9c18461b..3d818318c 100644
--- a/.github/workflows/windows-build.yml
+++ b/.github/workflows/windows-build.yml
@@ -15,18 +15,18 @@ jobs:
         msystem: mingw64
         update: false
         install: >-
-          git
-          zlib-devel
-          libbz2-devel
-          liblzma-devel
-          mingw-w64-x86_64-toolchain
           mingw-w64-x86_64-autotools
-          mingw-w64-x86_64-tools-git
+          mingw-w64-x86_64-bzip2
+          mingw-w64-x86_64-curl
           mingw-w64-x86_64-libdeflate
+          mingw-w64-x86_64-toolchain
+          mingw-w64-x86_64-tools-git
+          mingw-w64-x86_64-xz
+          mingw-w64-x86_64-zlib
     - name: Compile htslib
       shell: msys2 {0}
       run: |
-        export PATH=/mingw64/bin:$PATH
+        export PATH="/mingw64/bin:$PATH:/c/Program Files/Git/bin"
         export MSYSTEM=MINGW64
         autoreconf -i
         ./configure
@@ -34,7 +34,7 @@ jobs:
     - name: Check Htslib
       shell: msys2 {0}
       run: |
-        export PATH=/mingw64/bin:$PATH
+        export PATH="/mingw64/bin:$PATH:/c/Program Files/Git/bin"
         export MSYSTEM=MINGW64
         make test-shlib-exports && make check
 
diff --git a/README.md b/README.md
index 47afdba2a..0d1d85973 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 [![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://api.cirrus-ci.com/github/samtools/htslib)
-[![Build status](https://ci.appveyor.com/api/projects/status/v46hkwyfjp3l8nd3/branch/develop?svg=true)](https://ci.appveyor.com/project/samtools/htslib/branch/develop)
+[![Build status](https://github.com/samtools/htslib/actions/workflows/windows-build.yml/badge.svg)](https://github.com/samtools/htslib/actions/workflows/windows-build.yml?query=branch%3Adevelop)
 [![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib)
 
 HTSlib is an implementation of a unified C library for accessing common file

From a314b21dc1fa01845232b26b976e921f76186e7f Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 18 Jul 2024 11:58:02 +0100
Subject: [PATCH 51/76] Remove undefined behavior in expression language.

The bit-wise AND, OR and XOR logic works on the overloaded double
(res->d), but casting this to int64_t for NAN is undefined behaviour.

In fact the current cast doesn't even yield the correct bit-wise layout
for NAN, as (long)d and *(long *)&d give different results for d==NAN.
However the result wasn't used anyway as it's promptly cleared again
in the subsequent `if (undef) hts_expr_val_undef(res)` code.  Hence
undefined or otherwise, it made no difference.
---
 hts_expr.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/hts_expr.c b/hts_expr.c
index 5e5a132ea..0fdb3bc8b 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -527,8 +527,10 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0;
         } else {
             break;
         }
@@ -560,8 +562,10 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0;
         } else {
             break;
         }
@@ -593,8 +597,10 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
             } else if (res->is_str || val.is_str) {
                 hts_expr_val_free(&val);
                 return -1;
+            } else {
+                res->is_true =
+                    (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0;
             }
-            res->is_true = (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0;
         } else {
             break;
         }

From 25d03c68d81c1b02fec606c0dc6591b1b582ebae Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 18 Jul 2024 12:12:34 +0100
Subject: [PATCH 52/76] Fix signed overflow for hts_parse_decimal.

Obviously there can still be overflows if we attempt to parse numbers
which are too big, but for the legally accepted range of this, parsing
"-9,223,372,036,854,775,808" as tested in test/sam.c triggered a
problem as the positive version doesn't fit in "long long".

We parse as unsigned and only switch to signed via the implicit return
type conversion (and probably exploiting twos-complement, but that's a
fair assumption).
---
 hts.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hts.c b/hts.c
index caf85e64a..42be3f21f 100644
--- a/hts.c
+++ b/hts.c
@@ -3826,7 +3826,7 @@ void hts_itr_destroy(hts_itr_t *iter)
     }
 }
 
-static inline long long push_digit(long long i, char c)
+static inline unsigned long long push_digit(unsigned long long i, char c)
 {
     // ensure subtraction occurs first, avoiding overflow for >= MAX-48 or so
     int digit = c - '0';
@@ -3835,7 +3835,7 @@ static inline long long push_digit(long long i, char c)
 
 long long hts_parse_decimal(const char *str, char **strend, int flags)
 {
-    long long n = 0;
+    unsigned long long n = 0;
     int digits = 0, decimals = 0, e = 0, lost = 0;
     char sign = '+', esign = '+';
     const char *s, *str_orig = str;

From fbe5ff6c52544ee776a5d2ec0176c61ab103cce5 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 18 Jul 2024 12:28:40 +0100
Subject: [PATCH 53/76] Fix an undefined addition to a NULL pointer in
 vcf_format.

The pointer was never used, but the NULL+0 still triggers clang's ubsan.
---
 vcf.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vcf.c b/vcf.c
index 53f2b7a92..daedad34d 100644
--- a/vcf.c
+++ b/vcf.c
@@ -4020,7 +4020,10 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
 
     kputc_('\t', s); // INFO
     if (v->n_info) {
-        uint8_t *ptr = (uint8_t *)v->shared.s + v->unpack_size[0] + v->unpack_size[1] + v->unpack_size[2];
+        uint8_t *ptr = v->shared.s
+            ? (uint8_t *)v->shared.s + v->unpack_size[0] +
+               v->unpack_size[1] + v->unpack_size[2]
+            : NULL;
         int first = 1;
         bcf_info_t *info = v->d.info;
 

From 94777caeabf04ea49584a2db728064e62a805140 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 29 Jul 2024 21:17:48 +0100
Subject: [PATCH 54/76] Stricter limit on POS/MPOS/TLEN in sam_parse1()

Help avoid overflow on arithmetic involving POS, MPOS and TLEN
by limiting values in the SAM parser to fit in 62 bits (or 63
for TLEN as it's signed).  The new limit is still massively bigger
than any known reference so it should not cause any problems
in practice.

Credit to OSS-Fuzz
Fixes oss-fuzz 68750
---
 sam.c      |  7 ++++---
 test/sam.c | 18 +++++++++---------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/sam.c b/sam.c
index e39ec3f85..7e58da6e7 100644
--- a/sam.c
+++ b/sam.c
@@ -2947,7 +2947,7 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
     } else c->tid = -1;
 
     // pos
-    c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
+    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
     if (*p++ != '\t') goto err_ret;
     if (c->pos < 0 && c->tid >= 0) {
         _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
@@ -2990,15 +2990,16 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
         _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
     }
     // mpos
-    c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
+    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
     if (*p++ != '\t') goto err_ret;
     if (c->mpos < 0 && c->mtid >= 0) {
         _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
         c->mtid = -1;
     }
     // tlen
-    c->isize = hts_str2int(p, &p, 64, &overflow);
+    c->isize = hts_str2int(p, &p, 63, &overflow);
     if (*p++ != '\t') goto err_ret;
+    _parse_err(overflow, "number outside allowed range");
     // seq
     q = _read_token(p);
     if (strcmp(q, "*")) {
diff --git a/test/sam.c b/test/sam.c
index 09e4aecf5..74591fc2d 100644
--- a/test/sam.c
+++ b/test/sam.c
@@ -1,6 +1,6 @@
 /*  test/sam.c -- SAM/BAM/CRAM API test cases.
 
-    Copyright (C) 2014-2020, 2022-2023 Genome Research Ltd.
+    Copyright (C) 2014-2020, 2022-2024 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
@@ -1408,16 +1408,16 @@ static void check_big_ref(int parse_header)
         "@HD\tVN:1.4\n"
         "@SQ\tSN:large#1\tLN:5000000000\n"
         "@SQ\tSN:small#1\tLN:100\n"
-        "@SQ\tSN:large#2\tLN:9223372034707292158\n"
+        "@SQ\tSN:large#2\tLN:4611686018427387904\n"
         "@SQ\tSN:small#2\tLN:1\n"
         "r1\t0\tlarge#1\t4999999000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"
         "r2\t0\tsmall#1\t1\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"
-        "r3\t0\tlarge#2\t9223372034707292000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"
-        "p1\t99\tlarge#2\t1\t50\t8M\t=\t9223372034707292150\t9223372034707292158\tACGTACGT\tabcdefgh\n"
-        "p1\t147\tlarge#2\t9223372034707292150\t50\t8M\t=\t1\t-9223372034707292158\tACGTACGT\tabcdefgh\n"
+        "r3\t0\tlarge#2\t4611686018427387000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"
+        "p1\t99\tlarge#2\t1\t50\t8M\t=\t4611686018427387895\t4611686018427387903\tACGTACGT\tabcdefgh\n"
+        "p1\t147\tlarge#2\t4611686018427387895\t50\t8M\t=\t1\t-4611686018427387903\tACGTACGT\tabcdefgh\n"
         "r4\t0\tsmall#2\t2\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n";
     const hts_pos_t expected_lengths[] = {
-        5000000000LL, 100LL, 9223372034707292158LL, 1LL
+        5000000000LL, 100LL, 4611686018427387904LL, 1LL
     };
     const int expected_tids[] = {
         0, 1, 2, 2, 2, 3
@@ -1426,11 +1426,11 @@ static void check_big_ref(int parse_header)
         -1, -1, -1, 2, 2, -1
     };
     const hts_pos_t expected_positions[] = {
-        4999999000LL - 1, 1LL - 1, 9223372034707292000LL - 1, 1LL - 1,
-        9223372034707292150LL - 1, 2LL - 1
+        4999999000LL - 1, 1LL - 1, 4611686018427387000LL - 1, 1LL - 1,
+        4611686018427387895LL - 1, 2LL - 1
     };
     const hts_pos_t expected_mpos[] = {
-        -1, -1, -1, 9223372034707292150LL - 1, 1LL - 1, -1
+        -1, -1, -1, 4611686018427387895LL - 1, 1LL - 1, -1
     };
     samFile *in = NULL, *out = NULL;
     sam_hdr_t *header = NULL, *dup_header = NULL;

From 278e23d9f9abc10c44a244854f69c04f1e71c606 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Tue, 30 Jul 2024 12:49:35 +0100
Subject: [PATCH 55/76] Stricter limit on POS in vcf_parse()

Limiting POS to 62 bits helps avoid the risk of signed overflow
when it's set to a very extreme value.  The maximum is still
much higher than the length of the longest currently known
reference.
---
 vcf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vcf.c b/vcf.c
index daedad34d..7ce306f92 100644
--- a/vcf.c
+++ b/vcf.c
@@ -3703,7 +3703,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
 
     overflow = 0;
     char *tmp = p;
-    v->pos = hts_str2uint(p, &p, 63, &overflow);
+    v->pos = hts_str2uint(p, &p, 62, &overflow);
     if (overflow) {
         hts_log_error("Position value '%s' is too large", tmp);
         goto err;

From f1a7ec90fa5241e4626d490aec82c78b1122ba64 Mon Sep 17 00:00:00 2001
From: Petr Danecek <pd3@sanger.ac.uk>
Date: Tue, 23 Jul 2024 14:26:34 +0100
Subject: [PATCH 56/76] Extend the -O, --overlap option

to choose the denominator with respect to the SRC or TGT.
The extension is backward compatible.
---
 annot-tsv.1                 |  8 +++---
 annot-tsv.c                 | 50 ++++++++++++++++++++++++++-----------
 test/annot-tsv/out.13.1.txt |  2 ++
 test/annot-tsv/out.13.2.txt |  2 ++
 test/annot-tsv/out.13.3.txt |  2 ++
 test/annot-tsv/out.13.4.txt |  2 ++
 test/annot-tsv/src.13.txt   |  2 ++
 test/test.pl                |  6 +++++
 8 files changed, 57 insertions(+), 17 deletions(-)
 create mode 100644 test/annot-tsv/out.13.1.txt
 create mode 100644 test/annot-tsv/out.13.2.txt
 create mode 100644 test/annot-tsv/out.13.3.txt
 create mode 100644 test/annot-tsv/out.13.4.txt
 create mode 100644 test/annot-tsv/src.13.txt

diff --git a/annot-tsv.1 b/annot-tsv.1
index 34e1dd617..fcdec29aa 100644
--- a/annot-tsv.1
+++ b/annot-tsv.1
@@ -170,13 +170,15 @@ Ignore the headers completely and use numeric indexes even when a header exists
 Suppress index numbers in the printed header. If given twice, drop the entire header.
 .RE
 .PP
-.BR \-O ", " \-\-overlap " FLOAT"
+.BR \-O ", " \-\-overlap " FLOAT[,FLOAT]"
 .RS 4
-Minimum overlap as a fraction of region length in at least one of the overlapping regions. If also
+Minimum overlap as a fraction of region length in SRC and TGT, respectively (with two numbers), or in
+at least one of the overlapping regions (with a single number). If also
 .BR \-r ", " \-\-reciprocal
 is given, require at least
 .I FLOAT
-overlap with respect to both regions
+overlap with respect to both regions. Two identical numbers are equivalent to running with
+.BR \-r ", " \-\-reciprocal
 .RE
 .PP
 .BR \-r ", " \-\-reciprocal
diff --git a/annot-tsv.c b/annot-tsv.c
index e453ede5b..494c43744 100644
--- a/annot-tsv.c
+++ b/annot-tsv.c
@@ -105,8 +105,8 @@ typedef struct
     char *core_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str;
     char *temp_dir, *out_fname;
     BGZF *out_fp;
-    int allow_dups, reciprocal, max_annots, mode, no_write_hdr;
-    double overlap;
+    int allow_dups, max_annots, mode, no_write_hdr, overlap_either;
+    double overlap_src, overlap_dst;
     regidx_t *idx;
     regitr_t *itr;
     kstring_t tmp_kstr;
@@ -736,18 +736,20 @@ void process_line(args_t *args, char *line, size_t size)
     int has_match = 0, annot_len = 0;
     while ( regitr_overlap(args->itr) )
     {
-        if ( args->overlap )
+        if ( args->overlap_src || args->overlap_dst )
         {
-            double len1 = end - beg + 1;
-            double len2 = args->itr->end - args->itr->beg + 1;
+            double len_dst = end - beg + 1;
+            double len_src = args->itr->end - args->itr->beg + 1;
             double isec = (args->itr->end < end ? args->itr->end : end) - (args->itr->beg > beg ? args->itr->beg : beg) + 1;
-            if ( args->reciprocal )
+            int pass_dst = isec/len_dst < args->overlap_dst ? 0 : 1;
+            int pass_src = isec/len_src < args->overlap_src ? 0 : 1;
+            if ( args->overlap_either )
             {
-                if ( isec/len1 < args->overlap || isec/len2 < args->overlap ) continue;
+                if ( !pass_dst && !pass_src ) continue;
             }
             else
             {
-                if ( isec/len1 < args->overlap && isec/len2 < args->overlap ) continue;
+                if ( !pass_dst || !pass_src ) continue;
             }
         }
         cols_t *src_cols = regitr_payload(args->itr,cols_t*);
@@ -885,8 +887,9 @@ static const char *usage_text(void)
         "   -H, --ignore-headers    Use numeric indices, ignore the headers completely\n"
         "   -I, --no-header-idx     Suppress index numbers in the printed header. If given\n"
         "                           twice, drop the entire header\n"
-        "   -O, --overlap FLOAT     Minimum required overlap (non-reciprocal, unless -r\n"
-        "                           is given)\n"
+        "   -O, --overlap FLOAT[,FLOAT]     Minimum required overlap with respect to SRC,TGT.\n"
+        "                           If single value, the bigger overlap is considered.\n"
+        "                           Identical values are equivalent to running with -r.\n"
         "   -r, --reciprocal        Apply the -O requirement to both overlapping\n"
         "                           intervals\n"
         "   -x, --drop-overlaps     Drop overlapping regions (precludes -f)\n"
@@ -941,6 +944,7 @@ int main(int argc, char **argv)
     };
     char *tmp = NULL;
     int c;
+    int reciprocal = 0;
     while ((c = getopt_long(argc, argv, "c:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0)
     {
         switch (c)
@@ -960,16 +964,24 @@ int main(int argc, char **argv)
             case 'd': args->delim_str = optarg; break;
             case 'h': args->headers_str = optarg; break;
             case 'H': args->headers_str = "0:0"; break;
-            case 'r': args->reciprocal = 1; break;
+            case 'r': reciprocal = 1; break;
             case 'c': args->core_str  = optarg; break;
             case 't': args->dst.fname = optarg; break;
             case 'm': args->match_str = optarg; break;
             case 'a': args->annots_str = optarg; break;
             case 'o': args->out_fname = optarg; break;
             case 'O':
-                args->overlap = strtod(optarg, &tmp);
-                if ( tmp==optarg || *tmp ) error("Could not parse --overlap %s\n", optarg);
-                if ( args->overlap<0 || args->overlap>1 ) error("Expected value from the interval [0,1]: --overlap %s\n", optarg);
+                args->overlap_src = strtod(optarg, &tmp);
+                if ( tmp==optarg || (*tmp && *tmp!=',') ) error("Could not parse --overlap %s\n", optarg);
+                if ( args->overlap_src<0 || args->overlap_src>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg);
+                if ( *tmp )
+                {
+                    args->overlap_dst = strtod(tmp+1, &tmp);
+                    if ( *tmp ) error("Could not parse --overlap %s\n", optarg);
+                    if ( args->overlap_dst<0 || args->overlap_dst>1 ) error("Expected value(s) from the interval [0,1]: --overlap %s\n", optarg);
+                }
+                else
+                    args->overlap_either = 1;
                 break;
             case 's': args->src.fname = optarg; break;
             case 'f': args->transfer_str = optarg; break;
@@ -994,6 +1006,16 @@ int main(int argc, char **argv)
         else args->mode = PRINT_MATCHING|PRINT_NONMATCHING;
     }
     if ( (args->transfer_str || args->annots_str) && !(args->mode & PRINT_MATCHING) ) error("The option -x cannot be combined with -f and -a\n");
+    if ( reciprocal )
+    {
+        if ( args->overlap_dst && args->overlap_src && args->overlap_dst!=args->overlap_src )
+            error("The combination of --reciprocal with --overlap %f,%f makes no sense: expected single value or identical values\n",args->overlap_src,args->overlap_dst);
+        if ( !args->overlap_src )
+            args->overlap_src = args->overlap_dst;
+        else
+            args->overlap_dst = args->overlap_src;
+        args->overlap_either = 0;
+    }
 
     init_data(args);
     write_header(args, &args->dst);
diff --git a/test/annot-tsv/out.13.1.txt b/test/annot-tsv/out.13.1.txt
new file mode 100644
index 000000000..a1bf0be68
--- /dev/null
+++ b/test/annot-tsv/out.13.1.txt
@@ -0,0 +1,2 @@
+1	10	20	long	long,short
+1	15	15	short	long,short
diff --git a/test/annot-tsv/out.13.2.txt b/test/annot-tsv/out.13.2.txt
new file mode 100644
index 000000000..7c543b134
--- /dev/null
+++ b/test/annot-tsv/out.13.2.txt
@@ -0,0 +1,2 @@
+1	10	20	long	long
+1	15	15	short	short
diff --git a/test/annot-tsv/out.13.3.txt b/test/annot-tsv/out.13.3.txt
new file mode 100644
index 000000000..8911afad8
--- /dev/null
+++ b/test/annot-tsv/out.13.3.txt
@@ -0,0 +1,2 @@
+1	10	20	long	long
+1	15	15	short	long,short
diff --git a/test/annot-tsv/out.13.4.txt b/test/annot-tsv/out.13.4.txt
new file mode 100644
index 000000000..f7a0e4d88
--- /dev/null
+++ b/test/annot-tsv/out.13.4.txt
@@ -0,0 +1,2 @@
+1	10	20	long	long,short
+1	15	15	short	short
diff --git a/test/annot-tsv/src.13.txt b/test/annot-tsv/src.13.txt
new file mode 100644
index 000000000..de3338de1
--- /dev/null
+++ b/test/annot-tsv/src.13.txt
@@ -0,0 +1,2 @@
+1	10	20	long
+1	15	15	short
diff --git a/test/test.pl b/test/test.pl
index ef6a56612..b5f52bdfb 100755
--- a/test/test.pl
+++ b/test/test.pl
@@ -1472,4 +1472,10 @@ sub test_annot_tsv
     run_annot_tsv($opts,src=>'src.11.txt',dst=>'dst.11.txt',out=>'out.11.3.txt',args=>'-c chr2,beg2,end2:chr,beg,end -f smpl2:src_smpl -h 3:2 -I');
     run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.12.txt',out=>'out.12.1.txt',args=>'-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d ,');
     run_annot_tsv($opts,src=>'src.12.txt',dst=>'dst.11.txt',out=>'out.11.1.txt',args=>q[-c 1,2,3:1,2,3 -f 4:5 -h 0:0 -d $',:\t']);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.1.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5 -r]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5,0.5]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.3.txt',args=>q[-c 1,2,3 -f 4:5 -O 0,1]);
+    run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.4.txt',args=>q[-c 1,2,3 -f 4:5 -O 1,0]);
 }

From 2202fee19b19d2b34026bf2de6a39ed4ce4bc278 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 6 Aug 2024 14:47:07 +0100
Subject: [PATCH 57/76] Protect against mapped CRAM records at POS 0.

We check pos >= ref_len, but didn't check for pos 0 (aka -1 in BAM).

Credit to OSS-Fuzz
Fixes oss-fuzz 70917
---
 cram/cram_encode.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cram/cram_encode.c b/cram/cram_encode.c
index d59ea5253..5d22db54d 100644
--- a/cram/cram_encode.c
+++ b/cram/cram_encode.c
@@ -3441,6 +3441,11 @@ static int process_one_read(cram_fd *fd, cram_container *c,
         int64_t apos = cr->apos-1, spos = 0;
         int64_t MD_last = apos; // last position of edit in MD tag
 
+        if (apos < 0) {
+            hts_log_error("Mapped read with position <= 0 is disallowed");
+            return -1;
+        }
+
         cr->cigar       = s->ncigar;
         cr->ncigar      = bam_cigar_len(b);
         while (cr->cigar + cr->ncigar >= s->cigar_alloc) {

From 2e32e7f75511d46b3088c5351a7fb3b2b4e30bf4 Mon Sep 17 00:00:00 2001
From: Andrew Whitwham <whitwham@users.noreply.github.com>
Date: Thu, 15 Aug 2024 15:18:55 +0100
Subject: [PATCH 58/76] Copyright updates for summer 2024. (#1822)

---
 Makefile            | 2 +-
 configure.ac        | 4 ++--
 faidx.c             | 2 +-
 hts_expr.c          | 2 +-
 hts_probe_cc.sh     | 2 +-
 htslib/bgzf.h       | 2 +-
 htslib/cram.h       | 2 +-
 htslib/hts_defs.h   | 2 +-
 htslib/kstring.h    | 2 +-
 sam_internal.h      | 2 +-
 test/test_bgzf.c    | 2 +-
 test/test_kstring.c | 2 +-
 12 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index 8cea55554..b9f6e37f4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile for htslib, a C library for high-throughput sequencing data formats.
 #
-#    Copyright (C) 2013-2023 Genome Research Ltd.
+#    Copyright (C) 2013-2024 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
diff --git a/configure.ac b/configure.ac
index 19b48b5e3..87e928d47 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,6 +1,6 @@
 # Configure script for htslib, a C library for high-throughput sequencing data.
 #
-#    Copyright (C) 2015-2023 Genome Research Ltd.
+#    Copyright (C) 2015-2024 Genome Research Ltd.
 #
 #    Author: John Marshall <jm18@sanger.ac.uk>
 #
@@ -35,7 +35,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4])
 m4_include([m4/pkg.m4])
 
 dnl Copyright notice to be copied into the generated configure script
-AC_COPYRIGHT([Portions copyright (C) 2020-2023 Genome Research Ltd.
+AC_COPYRIGHT([Portions copyright (C) 2020-2024 Genome Research Ltd.
 
 This configure script is free software: you are free to change and
 redistribute it.  There is NO WARRANTY, to the extent permitted by law.])
diff --git a/faidx.c b/faidx.c
index 2e8968304..ed39c0ca0 100644
--- a/faidx.c
+++ b/faidx.c
@@ -1,6 +1,6 @@
 /*  faidx.c -- FASTA and FASTQ random access.
 
-    Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd.
+    Copyright (C) 2008, 2009, 2013-2020, 2022, 2024 Genome Research Ltd.
     Portions copyright (C) 2011 Broad Institute.
 
     Author: Heng Li <lh3@sanger.ac.uk>
diff --git a/hts_expr.c b/hts_expr.c
index 0fdb3bc8b..dfd15b151 100644
--- a/hts_expr.c
+++ b/hts_expr.c
@@ -1,6 +1,6 @@
 /*  hts_expr.c -- filter expression parsing and processing.
 
-    Copyright (C) 2020-2022 Genome Research Ltd.
+    Copyright (C) 2020-2022, 2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh
index eaa19470e..c9fc0a821 100755
--- a/hts_probe_cc.sh
+++ b/hts_probe_cc.sh
@@ -2,7 +2,7 @@
 
 # Check compiler options for non-configure builds and create Makefile fragment
 #
-#    Copyright (C) 2022-2023 Genome Research Ltd.
+#    Copyright (C) 2022-2024 Genome Research Ltd.
 #
 #    Author: Rob Davies <rmd@sanger.ac.uk>
 #
diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index c6ce7c172..4325d8e1c 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -3,7 +3,7 @@
 /*
    Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology
                  2011, 2012 Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2023 Genome Research Ltd
+   Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2024 Genome Research Ltd
 
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
diff --git a/htslib/cram.h b/htslib/cram.h
index 841e4a9b6..ddc44bbba 100644
--- a/htslib/cram.h
+++ b/htslib/cram.h
@@ -1,7 +1,7 @@
 /// @file htslib/cram.h
 /// CRAM format-specific API functions.
 /*
-    Copyright (C) 2015, 2016, 2018-2020, 2022-2023 Genome Research Ltd.
+    Copyright (C) 2015, 2016, 2018-2020, 2022-2024 Genome Research Ltd.
 
     Author: James Bonfield <jkb@sanger.ac.uk>
 
diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h
index e714e8fda..b5cded341 100644
--- a/htslib/hts_defs.h
+++ b/htslib/hts_defs.h
@@ -1,6 +1,6 @@
 /*  hts_defs.h -- Miscellaneous definitions.
 
-    Copyright (C) 2013-2015,2017, 2019-2020 Genome Research Ltd.
+    Copyright (C) 2013-2015,2017, 2019-2020, 2024 Genome Research Ltd.
 
     Author: John Marshall <jm18@sanger.ac.uk>
 
diff --git a/htslib/kstring.h b/htslib/kstring.h
index 0a3efb7d2..ebb2f9363 100644
--- a/htslib/kstring.h
+++ b/htslib/kstring.h
@@ -1,7 +1,7 @@
 /* The MIT License
 
    Copyright (C) 2011 by Attractive Chaos <attractor@live.co.uk>
-   Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd.
+   Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024 Genome Research Ltd.
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
diff --git a/sam_internal.h b/sam_internal.h
index 8f701f337..750c597b2 100644
--- a/sam_internal.h
+++ b/sam_internal.h
@@ -1,6 +1,6 @@
 /*  sam_internal.h -- internal functions; not part of the public API.
 
-    Copyright (C) 2019-2020 Genome Research Ltd.
+    Copyright (C) 2019-2020, 2023-2024 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/test/test_bgzf.c b/test/test_bgzf.c
index 244ababc5..bda68d1e9 100644
--- a/test/test_bgzf.c
+++ b/test/test_bgzf.c
@@ -1,6 +1,6 @@
 /* test/test_bgzf.c -- bgzf unit tests
 
-   Copyright (C) 2017, 2019, 2022-2023 Genome Research Ltd
+   Copyright (C) 2017, 2019, 2022-2024 Genome Research Ltd
 
    Author: Robert Davies <rmd@sanger.ac.uk>
 
diff --git a/test/test_kstring.c b/test/test_kstring.c
index f942656f1..8b6188b6e 100644
--- a/test/test_kstring.c
+++ b/test/test_kstring.c
@@ -1,6 +1,6 @@
 /*  test_kstring.c -- kstring unit tests
 
-    Copyright (C) 2018, 2020 Genome Research Ltd.
+    Copyright (C) 2018, 2020, 2024 Genome Research Ltd.
 
     Author: Rob Davies <rmd@sanger.ac.uk>
 

From b14639f9eb6ca03f5f24694cad8f13e37b0bb343 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 12 Aug 2024 09:38:47 +0100
Subject: [PATCH 59/76] Check underlying hopen() worked for preload: URLs

Otherwise, we might try to hread() a NULL pointer.

Credit to OSS-Fuzz
Fixes oss-fuzz id 71069
---
 hfile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hfile.c b/hfile.c
index 1241dcccb..552b71774 100644
--- a/hfile.c
+++ b/hfile.c
@@ -703,7 +703,7 @@ static int is_preload_url_remote(const char *url){
 
 static hFILE *hopen_preload(const char *url, const char *mode){
     hFILE* fp = hopen(url + 8, mode);
-    return hpreload(fp);
+    return fp ? hpreload(fp) : NULL;
 }
 
 hFILE *hdopen(int fd, const char *mode)

From ffdda1e82ec86c40e7f3ccdc8712b9ba9dc14e6c Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 18 Jul 2024 14:59:31 +0100
Subject: [PATCH 60/76] Add ubsan to the asan CI check.

We already have address sanitizer, so add "undefined" into the mix.

We also have to tweak a few warnings when using ubsan.
---
 .cirrus.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 5d2e3b6a7..d5d06bcff 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -73,9 +73,11 @@ gcc_task:
        USE_CONFIG: no
     - environment:
        USE_CONFIG: yes
-       CFLAGS: -std=c99 -pedantic -Wformat=2 -fsanitize=address
-       LDFLAGS: -fsanitize=address
+       # ubsan is incompatible with some -Wformat opts so we do that on clang.
+       CFLAGS: -fsanitize=address,undefined -DHTS_ALLOW_UNALIGNED=0 -Wno-format-truncation -Wno-format-overflow
+       LDFLAGS: -fsanitize=address,undefined
        USE_LIBDEFLATE: yes
+       UBSAN_OPTIONS: print_stacktrace=1:halt_on_error=1
 
   install_script: |
     apt-get update
@@ -108,8 +110,9 @@ ubuntu_task:
        USE_CONFIG: yes
        DO_UNTRACKED_FILE_CHECK: yes
     - environment:
+       # Cirrus-CI's clang isn't installed with ubsan, so we do that in gcc
        USE_CONFIG: yes
-       CFLAGS: -g -Wall -O3
+       CFLAGS: -std=c99 -pedantic -Wformat -g -Wall -O3
        USE_LIBDEFLATE: yes
 
   # NB: we could consider building a docker image with these

From 0fde269d94e56530b9b3eb2c910171a9bdc785fe Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 18 Jul 2024 16:41:48 +0100
Subject: [PATCH 61/76] Report the compiler and version when building.

Useful when diagnosing CI systems.
---
 .cirrus.yml                         |  1 +
 .github/workflows/windows-build.yml |  1 +
 Makefile                            | 11 +++++++++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index d5d06bcff..c144e75b7 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -40,6 +40,7 @@ compile_template: &COMPILE
     else
       MAKE_OPTS=-e
     fi
+    make cc-version
     if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then
       make maintainer-check
     fi
diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml
index 3d818318c..78520fcc7 100644
--- a/.github/workflows/windows-build.yml
+++ b/.github/workflows/windows-build.yml
@@ -30,6 +30,7 @@ jobs:
         export MSYSTEM=MINGW64
         autoreconf -i
         ./configure
+        make cc-version
         make -j6
     - name: Check Htslib
       shell: msys2 {0}
diff --git a/Makefile b/Makefile
index b9f6e37f4..70de0d778 100644
--- a/Makefile
+++ b/Makefile
@@ -113,8 +113,14 @@ BUILT_THRASH_PROGRAMS = \
 	test/thrash_threads6 \
 	test/thrash_threads7
 
-all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \
-     htslib_static.mk htslib-uninstalled.pc
+all: lib-static lib-shared $(BUILT_PROGRAMS) plugins \
+	$(BUILT_TEST_PROGRAMS) htslib_static.mk htslib-uninstalled.pc
+
+# Report compiler and version
+cc-version:
+	-@$(CC) --version  2>/dev/null || true
+	-@$(CC) --qversion 2>/dev/null || true
+	-@$(CC) -V         2>/dev/null || true
 
 ALL_CPPFLAGS = -I. $(CPPFLAGS)
 
@@ -996,3 +1002,4 @@ force:
 .PHONY: clean-dylib install-dylib
 .PHONY: test_htscodecs_rans4x8 test_htscodecs_rans4x16 test_htscodecs_arith
 .PHONY: test_htscodecs_tok3 test_htscodecs_fqzcomp test_htscodecs_varint
+.PHONY: cc-version

From d465874b6a54507e8dc8d24454bca5bfd8cfe410 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 15 Aug 2024 14:40:16 +0100
Subject: [PATCH 62/76] Add Cirrus CI -Wformat=2 checks to Rocky Linux.

We lost level 2 when we swapped the sanitizers between ubuntu gcc and
clang, as Clang's -Wformat=2 is too spammy and we couldn't keep it on
gcc as it breaks the sanitizers.  So this puts it onto a third build
instead.
---
 .cirrus.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index c144e75b7..9d80ea07d 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -141,7 +141,7 @@ rocky_task:
     LC_ALL: C
     CIRRUS_CLONE_DEPTH: 1
     USE_CONFIG: yes
-    CFLAGS: -std=gnu90
+    CFLAGS: -std=gnu90 -Wformat -Wformat=2
 
   # NB: we could consider building a docker image with these
   # preinstalled and specifying that instead, to speed up testing.

From 8e3ef4b97cf63fdd0fd95c1b09e3e74da9edd7d4 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 15 Aug 2024 16:14:56 +0100
Subject: [PATCH 63/76] Add HTS_FORMAT checks to a variety of places.

These are needed in newer clangs with -Wformat=2.
---
 .cirrus.yml                      | 2 +-
 bgzip.c                          | 4 ++--
 htsfile.c                        | 2 +-
 htslib/vcf.h                     | 3 ++-
 test/hfile.c                     | 3 ++-
 test/test-bcf-sr.c               | 2 +-
 test/test-bcf-translate.c        | 2 +-
 test/test-bcf_set_variant_type.c | 2 +-
 test/test-vcf-api.c              | 2 +-
 9 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 9d80ea07d..61f094c2b 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -113,7 +113,7 @@ ubuntu_task:
     - environment:
        # Cirrus-CI's clang isn't installed with ubsan, so we do that in gcc
        USE_CONFIG: yes
-       CFLAGS: -std=c99 -pedantic -Wformat -g -Wall -O3
+       CFLAGS: -std=c99 -pedantic -Wformat -Wformat=2 -g -Wall -O3
        USE_LIBDEFLATE: yes
 
   # NB: we could consider building a docker image with these
diff --git a/bgzip.c b/bgzip.c
index d795c80a9..740aef117 100644
--- a/bgzip.c
+++ b/bgzip.c
@@ -48,7 +48,7 @@
 
 static const int WINDOW_SIZE = BGZF_BLOCK_SIZE;
 
-static void error(const char *format, ...)
+static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
@@ -689,7 +689,7 @@ int main(int argc, char **argv)
                     if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 )
                         error("Could not load index: %s.gzi\n", argv[optind]);
                 }
-                if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
+                if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %ld-th (uncompressd) byte\n", start);
             }
 
             if (threads > 1)
diff --git a/htsfile.c b/htsfile.c
index 9af4ae31b..25af3f584 100644
--- a/htsfile.c
+++ b/htsfile.c
@@ -46,7 +46,7 @@ int show_headers = 1;
 int verbose = 0;
 int status = EXIT_SUCCESS;  /* Exit status from main */
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     int err = errno;
     va_list args;
diff --git a/htslib/vcf.h b/htslib/vcf.h
index e60911ab5..9a36cab05 100644
--- a/htslib/vcf.h
+++ b/htslib/vcf.h
@@ -596,7 +596,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
     int bcf_hdr_append(bcf_hdr_t *h, const char *line);
 
     HTSLIB_EXPORT
-    int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...);
+    int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
+    HTS_FORMAT(HTS_PRINTF_FMT, 2, 3);
 
     /** VCF version, e.g. VCFv4.2 */
     HTSLIB_EXPORT
diff --git a/test/hfile.c b/test/hfile.c
index 8f06a971f..741cf7a8d 100644
--- a/test/hfile.c
+++ b/test/hfile.c
@@ -35,7 +35,8 @@ DEALINGS IN THE SOFTWARE.  */
 #include "../htslib/hts_defs.h"
 #include "../htslib/kstring.h"
 
-void HTS_NORETURN fail(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN
+fail(const char *format, ...)
 {
     int err = errno;
     va_list args;
diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c
index 80daf0423..b4943b5ef 100644
--- a/test/test-bcf-sr.c
+++ b/test/test-bcf-sr.c
@@ -40,7 +40,7 @@
 #include "../htslib/hts.h"
 #include "../htslib/vcf.h"
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
diff --git a/test/test-bcf-translate.c b/test/test-bcf-translate.c
index c2f069e39..263e71eb8 100644
--- a/test/test-bcf-translate.c
+++ b/test/test-bcf-translate.c
@@ -29,7 +29,7 @@
 
 #include "../htslib/vcf.h"
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c
index 3688609f6..eb12ecde3 100644
--- a/test/test-bcf_set_variant_type.c
+++ b/test/test-bcf_set_variant_type.c
@@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include "../htslib/hts.h"
 #include "../vcf.c"
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c
index ff16fa194..b86b71d99 100644
--- a/test/test-vcf-api.c
+++ b/test/test-vcf-api.c
@@ -33,7 +33,7 @@ DEALINGS IN THE SOFTWARE.  */
 #include "../htslib/kstring.h"
 #include "../htslib/kseq.h"
 
-void error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);

From cf6e166b24d1534ff38e30f7d685242de2bc8b59 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 15 Aug 2024 14:45:23 +0100
Subject: [PATCH 64/76] Fix hts_hfile() for compressed text_format files

SAM and VCF files opened for write both start with
htsFile::format.format set to text_format, and then update
it after writing the header.  This means it's possible to open
a text_format file with compression, so hts_hfile() needs to
handle it in the same way as sam and vcf.

Noticed when hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE)
failed in samtools sort in the case where fpout was a compressed
sam file.  At this point fpout was a text_file, so the wrong
pointer was passed to hfile_set_blksize(), which luckily bailed
out and returned -1 before doing any damage to the incorrectly
passed-in structure.
---
 hts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hts.c b/hts.c
index 42be3f21f..55af35cd4 100644
--- a/hts.c
+++ b/hts.c
@@ -1764,7 +1764,7 @@ static hFILE *hts_hfile(htsFile *fp) {
     case bcf:          // fall through
     case bam:          return bgzf_hfile(fp->fp.bgzf);
     case cram:         return cram_hfile(fp->fp.cram);
-    case text_format:  return fp->fp.hfile;
+    case text_format:  // fall through
     case vcf:          // fall through
     case fastq_format: // fall through
     case fasta_format: // fall through

From 5f49a1b98872698899802dee781cc109ff061765 Mon Sep 17 00:00:00 2001
From: John Marshall <jmarshall@hey.com>
Date: Fri, 19 Jul 2024 21:00:06 +1200
Subject: [PATCH 65/76] Remove samples/write_fast.c's unused sam_hdr_t and
 other minor fixes

The code confusingly never instantiated out_samhdr, and doesn't really
have a header to supply. It got away with this because writing FASTA/Q
doesn't actually need a header, so make that explicit.

Cast time_t, which isn't necessarily a long itself.

Use the usual incantation so that doing "make clean" twice completes
without diagnostics even when the $(PRGS) files already don't exist.
---
 samples/Makefile     | 4 +---
 samples/write_fast.c | 9 +++------
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/samples/Makefile b/samples/Makefile
index ecbede4c5..ee632e3ad 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -114,6 +114,4 @@ qtask_unordered: qtask_unordered.c
 clean:
 	find . -name "*.o" | xargs rm -rf
 	find . -name "*.dSYM" | xargs rm -rf
-	rm $(PRGS)
-
-
+	-rm -f $(PRGS)
diff --git a/samples/write_fast.c b/samples/write_fast.c
index 626c693f6..95d919fd0 100644
--- a/samples/write_fast.c
+++ b/samples/write_fast.c
@@ -53,7 +53,6 @@ int main(int argc, char *argv[])
     const char *outname = NULL;             //output file name
     int ret = EXIT_FAILURE;
     samFile *outfile = NULL;                //sam file
-    sam_hdr_t *out_samhdr = NULL;           //header of file
     bam1_t *bamdata = NULL;                 //to hold the read data
     char mode[4] = "a";
     const char *data = NULL, *qual = NULL;  //ref data and quality
@@ -93,22 +92,20 @@ int main(int argc, char *argv[])
     sam_open_format(outname, mode, fmt);
     */
 
-    snprintf(name, sizeof(name), "Test_%ld", time(NULL));
+    snprintf(name, sizeof(name), "Test_%ld", (long) time(NULL));
     //data
     if (bam_set1(bamdata, strlen(name), name, BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, strlen(data), data, qual, 0) < 0) {
         printf("Failed to set data\n");
         goto end;
     }
-    if (sam_write1(outfile, out_samhdr, bamdata) < 0) {
+    //as we write only FASTA/FASTQ, we can get away without providing headers
+    if (sam_write1(outfile, NULL, bamdata) < 0) {
         printf("Failed to write data\n");
         goto end;
     }
     ret = EXIT_SUCCESS;
 end:
     //clean up
-    if (out_samhdr) {
-        sam_hdr_destroy(out_samhdr);
-    }
     if (outfile) {
         sam_close(outfile);
     }

From f399d69fd03cd201c4aaa7e645462cf0ac6bf6a7 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 22 Aug 2024 14:16:02 +0100
Subject: [PATCH 66/76] Update htscodecs to 1.6.1

---
 htscodecs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htscodecs b/htscodecs
index 16548914a..51794289a 160000
--- a/htscodecs
+++ b/htscodecs
@@ -1 +1 @@
-Subproject commit 16548914ada64cf77acd7c64562b085ed1a4ccd9
+Subproject commit 51794289ac47455209c333182b6768f99a613948

From 5d54aa58ebedf38696259cc3c7164bf4ff964097 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 22 Aug 2024 14:16:25 +0100
Subject: [PATCH 67/76] Add more warnings and reenable -Werror.

These find an old bug-let in hts_parse_format():
The buffer size is 8, but it includes \0 so the string would have been
truncated, causing the strcmp to fail for "fastq.gz".

Curiously gcc 10 and 11 spot this, but not gcc 12 or 13, nor clang.

Some builds didn't have -Werror enabled, such as those without
./configure and on Windows.
---
 .cirrus.yml                         | 3 ++-
 .github/workflows/windows-build.yml | 2 +-
 hts.c                               | 2 +-
 test/test-bcf-sr.c                  | 1 +
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 61f094c2b..a1c965e25 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -72,6 +72,7 @@ gcc_task:
        DO_MAINTAINER_CHECKS: yes
        DO_UNTRACKED_FILE_CHECK: yes
        USE_CONFIG: no
+       CFLAGS: -g -O2 -Wall -Werror -fvisibility=hidden
     - environment:
        USE_CONFIG: yes
        # ubsan is incompatible with some -Wformat opts so we do that on clang.
@@ -141,7 +142,7 @@ rocky_task:
     LC_ALL: C
     CIRRUS_CLONE_DEPTH: 1
     USE_CONFIG: yes
-    CFLAGS: -std=gnu90 -Wformat -Wformat=2
+    CFLAGS: -g -O3 -std=gnu90 -Wformat -Wformat=2 -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-missing-field-initializers -Wno-empty-body
 
   # NB: we could consider building a docker image with these
   # preinstalled and specifying that instead, to speed up testing.
diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml
index 78520fcc7..bf6f5ae53 100644
--- a/.github/workflows/windows-build.yml
+++ b/.github/workflows/windows-build.yml
@@ -29,7 +29,7 @@ jobs:
         export PATH="/mingw64/bin:$PATH:/c/Program Files/Git/bin"
         export MSYSTEM=MINGW64
         autoreconf -i
-        ./configure
+        ./configure --enable-werror
         make cc-version
         make -j6
     - name: Check Htslib
diff --git a/hts.c b/hts.c
index 55af35cd4..a8a8bead2 100644
--- a/hts.c
+++ b/hts.c
@@ -1322,7 +1322,7 @@ int hts_parse_opt_list(htsFormat *fmt, const char *str) {
  *        -1 on failure.
  */
 int hts_parse_format(htsFormat *format, const char *str) {
-    char fmt[8];
+    char fmt[9];
     const char *cp = scan_keyword(str, ',', fmt, sizeof fmt);
 
     format->version.minor = 0; // unknown
diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c
index b4943b5ef..e24954685 100644
--- a/test/test-bcf-sr.c
+++ b/test/test-bcf-sr.c
@@ -179,6 +179,7 @@ int main(int argc, char *argv[])
                 break;
             case 'h':
                 usage(EXIT_SUCCESS);
+                // fall-through
             default: usage(EXIT_FAILURE);
         }
     }

From 5ce0f37115be6064c8006317f44b302030b6ff64 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 23 Aug 2024 18:26:23 +0100
Subject: [PATCH 68/76] Fix more warnings and adjust Cirrus CI rules

Change hclose() to hclose_abruptly() in bgzip, to fix an
empty body warning, allowing removal of -Wno-empty-body in CI
tests.

Fix signed/unsigned comparison in htslib/bgzf.h, which caused a warning
in the maintainer-check usepublic.cpp check.  It appears that -Wall
enables -Wsign-compare when compiling C++ (unlike for C).

Make CFLAGS settings more consistent in .cirrus.yml, and ensure
$MAKE_OPTS is consistently applied in all make invocations.  The
latter is necessary because "make cc-version" and
"make maintainer-check" can trigger production of some .mk files,
and we want them to be made in the same way as if we ran the full
build.

Switch MacOS build to Sonoma as Ventura is no longer supported
by Cirrus.
---
 .cirrus.yml   | 24 +++++++++++++-----------
 bgzip.c       |  3 +--
 htslib/bgzf.h |  3 ++-
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index a1c965e25..6da99dde0 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -35,22 +35,22 @@ compile_template: &COMPILE
     if test "$USE_CONFIG" = "yes"; then
       MAKE_OPTS=
       autoreconf -i
-      eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \
+      eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"$CFLAGS\" || \
         ( cat config.log; false )
     else
       MAKE_OPTS=-e
     fi
-    make cc-version
+    make cc-version $MAKE_OPTS
     if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then
-      make maintainer-check
+      make maintainer-check $MAKE_OPTS
     fi
     make -j 4 $MAKE_OPTS
 
 test_template: &TEST
   test_script: |
-    make test-shlib-exports
-    make test
-    if test "x$DO_UNTRACKED_FILE_CHECK" = "xyes"; then make check-untracked ; fi
+    make test-shlib-exports $MAKE_OPTS
+    make test $MAKE_OPTS
+    if test "x$DO_UNTRACKED_FILE_CHECK" = "xyes"; then make check-untracked $MAKE_OPTS ; fi
 
 #--------------------------------------------------
 # Task: linux builds.
@@ -76,7 +76,7 @@ gcc_task:
     - environment:
        USE_CONFIG: yes
        # ubsan is incompatible with some -Wformat opts so we do that on clang.
-       CFLAGS: -fsanitize=address,undefined -DHTS_ALLOW_UNALIGNED=0 -Wno-format-truncation -Wno-format-overflow
+       CFLAGS: -g -O3 -fsanitize=address,undefined -DHTS_ALLOW_UNALIGNED=0 -Wno-format-truncation -Wno-format-overflow
        LDFLAGS: -fsanitize=address,undefined
        USE_LIBDEFLATE: yes
        UBSAN_OPTIONS: print_stacktrace=1:halt_on_error=1
@@ -110,11 +110,12 @@ ubuntu_task:
   matrix:
     - environment:
        USE_CONFIG: yes
+       CFLAGS: -g -O3
        DO_UNTRACKED_FILE_CHECK: yes
     - environment:
        # Cirrus-CI's clang isn't installed with ubsan, so we do that in gcc
        USE_CONFIG: yes
-       CFLAGS: -std=c99 -pedantic -Wformat -Wformat=2 -g -Wall -O3
+       CFLAGS: -g -O3 -std=c99 -pedantic -Wall -Wformat -Wformat=2
        USE_LIBDEFLATE: yes
 
   # NB: we could consider building a docker image with these
@@ -142,7 +143,7 @@ rocky_task:
     LC_ALL: C
     CIRRUS_CLONE_DEPTH: 1
     USE_CONFIG: yes
-    CFLAGS: -g -O3 -std=gnu90 -Wformat -Wformat=2 -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-missing-field-initializers -Wno-empty-body
+    CFLAGS: -g -O3 -std=gnu90 -Wall -Wformat -Wformat=2 -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-missing-field-initializers
 
   # NB: we could consider building a docker image with these
   # preinstalled and specifying that instead, to speed up testing.
@@ -187,11 +188,10 @@ arm_ubuntu_task:
 macosx_task:
   name: macosx + clang
   macos_instance:
-    image: ghcr.io/cirruslabs/macos-ventura-base:latest
+    image: ghcr.io/cirruslabs/macos-runner:sonoma
 
   environment:
     CC: clang
-    CFLAGS: "-Wall -arch arm64 -arch x86_64"
     LDFLAGS: "-arch arm64 -arch x86_64"
     LIBDEFLATE_CFLAGS: "-arch arm64 -arch x86_64"
     LC_ALL: C
@@ -200,9 +200,11 @@ macosx_task:
   matrix:
     - environment:
        USE_CONFIG: no
+       CFLAGS: "-g -O3 -Wall -Werror -arch arm64 -arch x86_64"
     - environment:
        USE_CONFIG: yes
        USE_LIBDEFLATE: yes
+       CFLAGS: "-g -O3 -Wall -arch arm64 -arch x86_64"
 
   package_install_script: |
     HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git \
diff --git a/bgzip.c b/bgzip.c
index 740aef117..687b29d47 100644
--- a/bgzip.c
+++ b/bgzip.c
@@ -362,8 +362,7 @@ int main(int argc, char **argv)
                         }
                         else {
                             ret = 2;                        //explicit N - no overwrite, continue and return 2
-                            if (hclose(f_src) < 0)
-                                ;                           //ignoring return value
+                            hclose_abruptly(f_src);
                             free(name);
                             continue;
                         }
diff --git a/htslib/bgzf.h b/htslib/bgzf.h
index 4325d8e1c..87d4c6a3b 100644
--- a/htslib/bgzf.h
+++ b/htslib/bgzf.h
@@ -182,7 +182,8 @@ static inline ssize_t bgzf_read_small(BGZF *fp, void *data, size_t length) {
  */
 static inline
 ssize_t bgzf_write_small(BGZF *fp, const void *data, size_t length) {
-    if (fp->is_compressed && BGZF_BLOCK_SIZE - fp->block_offset > length) {
+    if (fp->is_compressed
+        && (size_t) (BGZF_BLOCK_SIZE - fp->block_offset) > length) {
         // Short cut the common and easy mode
         memcpy((uint8_t *)fp->uncompressed_block + fp->block_offset,
                data, length);

From b93a8ce7618aabb1ead506b4e0a57b43d0464405 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 23 Aug 2024 19:14:30 +0100
Subject: [PATCH 69/76] Add HTS_NORETURN to test-bcf-sr usage and error
 functions

Hat-tip to John Marshall
---
 test/test-bcf-sr.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c
index e24954685..0fb59905c 100644
--- a/test/test-bcf-sr.c
+++ b/test/test-bcf-sr.c
@@ -36,11 +36,13 @@
 #include <strings.h>
 #include <errno.h>
 
+#include "../htslib/hts_defs.h"
 #include "../htslib/synced_bcf_reader.h"
 #include "../htslib/hts.h"
 #include "../htslib/vcf.h"
 
-void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
+void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN
+error(const char *format, ...)
 {
     va_list ap;
     va_start(ap, format);
@@ -49,7 +51,7 @@ void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) error(const char *format, ...)
     exit(EXIT_FAILURE);
 }
 
-void usage(int exit_code)
+void HTS_NORETURN usage(int exit_code)
 {
     fprintf(stderr, "Usage: test-bcf-sr [OPTIONS] vcf-list.txt\n");
     fprintf(stderr, "       test-bcf-sr [OPTIONS] -args file1.bcf [...]\n");
@@ -179,7 +181,6 @@ int main(int argc, char *argv[])
                 break;
             case 'h':
                 usage(EXIT_SUCCESS);
-                // fall-through
             default: usage(EXIT_FAILURE);
         }
     }

From c12496f3b16f441f7641fac62cd24db932cb81e5 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 23 Aug 2024 19:51:42 +0100
Subject: [PATCH 70/76] Remove -lz from test link lines

It's already in $(LIBS) and the MacOS linker complains if it
appears more than once
---
 Makefile | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/Makefile b/Makefile
index 70de0d778..94bba57d1 100644
--- a/Makefile
+++ b/Makefile
@@ -666,22 +666,22 @@ test/sam: test/sam.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/sam.o libhts.a $(LIBS) -lpthread
 
 test/test_bgzf: test/test_bgzf.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a $(LIBS) -lpthread
 
 test/test_expr: test/test_expr.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a $(LIBS) -lpthread
 
 test/test_faidx: test/test_faidx.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a $(LIBS) -lpthread
 
 test/test_kfunc: test/test_kfunc.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a $(LIBS) -lpthread
 
 test/test_khash: test/test_khash.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_khash.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_khash.o libhts.a $(LIBS) -lpthread
 
 test/test_kstring: test/test_kstring.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a $(LIBS) -lpthread
 
 test/test_mod: test/test_mod.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread
@@ -717,10 +717,10 @@ test/test-vcf-sweep: test/test-vcf-sweep.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test-vcf-sweep.o libhts.a $(LIBS) -lpthread
 
 test/test-bcf-sr: test/test-bcf-sr.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test-bcf-sr.o libhts.a $(LIBS) -lpthread
 
 test/test-bcf-translate: test/test-bcf-translate.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a $(LIBS) -lpthread
 
 test/test_introspection: test/test_introspection.o libhts.a
 	$(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread
@@ -815,25 +815,25 @@ test/usepublic.o: test/usepublic.cpp config.h $(htslib_bgzf_h) $(htslib_cram_h)
 
 
 test/thrash_threads1: test/thrash_threads1.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads1.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads1.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads2: test/thrash_threads2.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads2.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads2.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads3: test/thrash_threads3.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads3.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads3.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads4: test/thrash_threads4.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads4.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads4.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads5: test/thrash_threads5.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads5.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads5.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads6: test/thrash_threads6.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads6.o libhts.a $(LIBS) -lpthread
 
 test/thrash_threads7: test/thrash_threads7.o libhts.a
-	$(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a -lz $(LIBS) -lpthread
+	$(CC) $(LDFLAGS) -o $@ test/thrash_threads7.o libhts.a $(LIBS) -lpthread
 
 test_thrash: $(BUILT_THRASH_PROGRAMS)
 

From ac0ee54638900ad337a8cbffc12887397a05b19f Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 4 Sep 2024 16:33:48 +0100
Subject: [PATCH 71/76] Add NEWS updates (PR #1818)

* Add NEWS updates
* Document the future plan to make CRAM3.1 the default version

---------

Co-authored-by: Rob Davies <rmd+git@sanger.ac.uk>
---
 NEWS | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/NEWS b/NEWS
index 8af5fc63f..e6962e25f 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,124 @@
 Noteworthy changes in release a.b
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+The primary user-visible changes in this release are updates to the
+annot-tsv tool and some speed improvements.  Full details of other
+changes and bugs fixed are below.
+
+Notice: this is the last SAMtools / HTSlib release where CRAM 3.0 will be
+the default CRAM version.  From the next release we will change to CRAM 3.1
+unless the version is explicitly specified, for example using
+"samtools view -O cram,version=3.0".
+
+
+Updates
+-------
+
+* Extend annot-tsv with several new command line options.
+    --delim permits use of other delimiters.
+    --headers for selection of other header formats.
+    --no-header-idx to suppress column index numbers in header.
+  Also removed -h as it is now short for --headers.  Note --help
+  still works. (PR #1779)
+
+* Allow annot-tsv -a to rename annotations. (PR #1709)
+
+* Extend annot-tsv --overlap to be able to specify the overlap
+  fraction separately for source and target. (PR #1811)
+
+* Added new APIs to facilitate low-level CRAM container manipulations,
+  used by   the new "samtools cat" region filtering code. Functions are
+  cram_container_get_coords(), cram_filter_container(),
+  cram_index_extents(), cram_container_num2offset(),
+  cram_num_containers(), cram_num_containers_between(),
+  cram_skip_container().
+  Also improved cram_index_query() and cram_index_query() to cope with
+  HTS_IDX_NOCOOR regions.  (PR #1771)
+
+* Bgzip now retains file modification and access times when
+  compressing and decompressing. (PR #1727, fixes #1718.  Requested by
+  Gert Hulselmans.)
+
+* Use FNV1a for string hashing in khash.  The old algorithm was
+  particularly weak with base-64 style strings and led to a large
+  number of collisions.  (PR #1806.  Fixes samtools/samtools#2066,
+  reported by Hans-Joachim Ruscheweyh)
+
+* Improve the speed of the nibble2base() function on Intel (PR
+  #1667, PR #1764, PR #1786, PR #1802, thanks to Ruben Vorderman) and
+  ARM (PR #1795, thanks to John Marshall).
+
+* Speed up bgzf_read().  While this does not reduce CPU significantly,
+  it does increase the maximum parallelism available permitting 10-15%
+  faster decoding. (PR #1772, PR #1800, Issue #1798)
+
+* Speed up faidx by use of better isgraph methods (PR #1797) and
+  whole-line reading (PR #1799, thanks to John Marshall).
+
+* Speed up kputll() function, speeding up BAM -> SAM conversion by
+  about 5% and also samtools depth.  (PR #1805)
+
+* Added more example code, covering fasta/fastq indexing, tabix
+  indexing and use of the thread pool. (PR #1666)
+
+Build Changes
+-------------
+
+* Code warning fixes for pedantic compilers (PR #1777) and avoid
+  some undefined behaviour (PR #1810, PR #1816, PR #1828).
+
+* Windows based CI has been migrated from AppVeyor to GitHub Actions.
+  (PR #1796, PR #1803, PR #1808)
+
+* Miscellaneous minor build infrastructure and code fixes.
+  (PR #1807, PR #1829, both thanks to John Marshall)
+
+* Updated htscodecs submodule to version 1.6.1 (PR #1828)
+
+Bug fixes
+---------
+
+* Fix small OSS-Fuzz reported issues with CRAM encoding and long
+  CIGARS and/or illegal positions. (PR #1775, PR #1801, PR #1817)
+
+* Stricter limits on POS / MPOS / TLEN in sam_parse1().  This fixes
+  a signed overflow reported by OSS-Fuzz and should help prevent other
+  as-yet undetected bugs. (PR #1812)
+
+* Check that the underlying file open worked for preload: URLs.  Fixes
+  a NULL pointer dereference reported by OSS-Fuzz. (PR #1821)
+
+* Fix an infinite loop in hts_itr_query() when given extremely large
+  positions which cause integer overflow.  Also adds hts_bin_maxpos()
+  and hts_idx_maxpos() functions.
+  (PR #1774, thanks to John Marshall and reported by Jesus Alberto
+  Munoz Mesa)
+
+* Fix an out of bounds read in hts_itr_multi_next() when switching
+  chromosomes.  This bug is present in releases 1.11 to 1.20.
+  (PR #1788. Fixes samtools/samtools#2063, reported by acorvelo)
+
+* Work around parsing problems with colons in CHROM names.
+  Fixes samtools/bcftools#2139.  (PR #1781, John Marshall / James Bonfield)
+
+* Correct the CPU detection for Mac OS X 10.7.  cpuid is used by
+  htscodecs (see samtools/htscodecs#116), and the corresponding
+  changes in htslib are PR #1785.  Reported by Ryan Carsten Schmidt.
+
+* Make BAM zero-length intervals work the same as CRAM; permitted and
+  returning overlapping records. (PR #1787.  Fixes
+  samtools/samtools#2060, reported by acorvelo)
+
+* Warn in bgzf_getline() encounters UTF-16 data.
+  (PR #1487, thanks to John Marshall)
+
+* Replace assert() with abort() in BCF synced reader.  This is not an
+  ideal solution, but it gives consistent behaviour when compiling
+  with or without NDEBUG.  (PR #1791, thanks to Martin Pollard)
+
+* Fixed failure to change the write block size on compressed SAM or VCF
+  files due to an internal type confusion.  (PR #1826)
+
 Noteworthy changes in release 1.20 (15th April 2024)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

From 2f3379d6c87797c51bbc2835de5bf317db959fd7 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 5 Sep 2024 16:21:21 +0100
Subject: [PATCH 72/76] Fix out-of-bounds read in cram_codec_iter_next()

cram_block_compression_hdr::tag_encoding_map[] has CRAM_MAP_HASH
elements, so the iterator should not go beyond that.
---
 cram/cram_external.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cram/cram_external.c b/cram/cram_external.c
index c6d7d66af..4943750dd 100644
--- a/cram/cram_external.c
+++ b/cram/cram_external.c
@@ -291,7 +291,7 @@ static cram_codec *cram_codec_iter_next(cram_codec_iter *iter,
             iter->curr_map = iter->curr_map->next;
             return cc;
         }
-    } while (iter->idx <= CRAM_MAP_HASH);
+    } while (iter->idx < CRAM_MAP_HASH);
 
     // End of codecs
     return NULL;

From f41fda4668930a170e02c34713ee059c1aba3c63 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Thu, 5 Sep 2024 15:25:32 +0100
Subject: [PATCH 73/76] Fix gawk-ism in Makefile

The awk script used to extract htscodec's version string (if
git describe fails or returns a hash) used a three-argument
form of "match", which is a GNU extension.  Rewrite for
better compatibility with other awk implementations,
notably "mawk", which supplies awk in some Cirrus-CI images.

Also fix the cirrus-ci badge, which linked to the wrong location.
---
 Makefile  | 2 +-
 README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 94bba57d1..11492b237 100644
--- a/Makefile
+++ b/Makefile
@@ -573,7 +573,7 @@ htscodecs/htscodecs/version.h: force
 	  vers=`cd $(srcdir)/htscodecs && git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \
 	  case "$$vers" in \
 	    v*) vers=$${vers#v} ;; \
-	    *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \
+	    *) iv=`awk '/^AC_INIT\(htscodecs,/ { match($$0, /[0-9](\.[0-9])*/); print substr($$0, RSTART, RLENGTH) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \
 	  esac ; \
 	  if ! grep -s -q '"'"$$vers"'"' $@ ; then \
 	    echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \
diff --git a/README.md b/README.md
index 0d1d85973..2906855ba 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://api.cirrus-ci.com/github/samtools/htslib)
+[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://cirrus-ci.com/github/samtools/htslib)
 [![Build status](https://github.com/samtools/htslib/actions/workflows/windows-build.yml/badge.svg)](https://github.com/samtools/htslib/actions/workflows/windows-build.yml?query=branch%3Adevelop)
 [![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib)
 

From 86b0652bd2488e0ca08fc803aa299c855f8681a9 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Fri, 6 Sep 2024 10:23:19 +0100
Subject: [PATCH 74/76] Allow for more than one digit in htscodecs version
 number

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 11492b237..0c0a9676e 100644
--- a/Makefile
+++ b/Makefile
@@ -573,7 +573,7 @@ htscodecs/htscodecs/version.h: force
 	  vers=`cd $(srcdir)/htscodecs && git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \
 	  case "$$vers" in \
 	    v*) vers=$${vers#v} ;; \
-	    *) iv=`awk '/^AC_INIT\(htscodecs,/ { match($$0, /[0-9](\.[0-9])*/); print substr($$0, RSTART, RLENGTH) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \
+	    *) iv=`awk '/^AC_INIT\(htscodecs,/ { match($$0, /[0-9]+(\.[0-9]+)*/); print substr($$0, RSTART, RLENGTH) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \
 	  esac ; \
 	  if ! grep -s -q '"'"$$vers"'"' $@ ; then \
 	    echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \

From bf118051d99ceebcf7809033d2aa00a68f107456 Mon Sep 17 00:00:00 2001
From: Rob Davies <rmd+git@sanger.ac.uk>
Date: Mon, 9 Sep 2024 09:25:48 +0100
Subject: [PATCH 75/76] Add last-minute NEWS items for release

---
 NEWS | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index e6962e25f..be8de6682 100644
--- a/NEWS
+++ b/NEWS
@@ -27,13 +27,16 @@ Updates
   fraction separately for source and target. (PR #1811)
 
 * Added new APIs to facilitate low-level CRAM container manipulations,
-  used by   the new "samtools cat" region filtering code. Functions are
-  cram_container_get_coords(), cram_filter_container(),
-  cram_index_extents(), cram_container_num2offset(),
-  cram_num_containers(), cram_num_containers_between(),
-  cram_skip_container().
-  Also improved cram_index_query() and cram_index_query() to cope with
-  HTS_IDX_NOCOOR regions.  (PR #1771)
+  used by the new "samtools cat" region filtering code. Functions are:
+    cram_container_get_coords()
+    cram_filter_container()
+    cram_index_extents()
+    cram_container_num2offset()
+    cram_container_offset2num()
+    cram_num_containers()
+    cram_num_containers_between()
+  Also improved cram_index_query() to cope with HTS_IDX_NOCOOR regions.
+  (PR #1771)
 
 * Bgzip now retains file modification and access times when
   compressing and decompressing. (PR #1727, fixes #1718.  Requested by
@@ -48,6 +51,9 @@ Updates
   #1667, PR #1764, PR #1786, PR #1802, thanks to Ruben Vorderman) and
   ARM (PR #1795, thanks to John Marshall).
 
+* bgzf_getline() will now warn if it encounters UTF-16 data.
+  (PR #1487, thanks to John Marshall)
+
 * Speed up bgzf_read().  While this does not reduce CPU significantly,
   it does increase the maximum parallelism available permitting 10-15%
   faster decoding. (PR #1772, PR #1800, Issue #1798)
@@ -75,6 +81,8 @@ Build Changes
 
 * Updated htscodecs submodule to version 1.6.1 (PR #1828)
 
+* Fixed an awk script in the Makefile that only worked with gawk. (PR #1831)
+
 Bug fixes
 ---------
 
@@ -109,9 +117,6 @@ Bug fixes
   returning overlapping records. (PR #1787.  Fixes
   samtools/samtools#2060, reported by acorvelo)
 
-* Warn in bgzf_getline() encounters UTF-16 data.
-  (PR #1487, thanks to John Marshall)
-
 * Replace assert() with abort() in BCF synced reader.  This is not an
   ideal solution, but it gives consistent behaviour when compiling
   with or without NDEBUG.  (PR #1791, thanks to Martin Pollard)
@@ -119,6 +124,8 @@ Bug fixes
 * Fixed failure to change the write block size on compressed SAM or VCF
   files due to an internal type confusion.  (PR #1826)
 
+* Fixed an out-of-bounds read in cram_codec_iter_next() (PR #1832)
+
 Noteworthy changes in release 1.20 (15th April 2024)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

From b66c6d213ac8503142f4ea674af467f92587b7c5 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Thu, 12 Sep 2024 15:11:45 +0100
Subject: [PATCH 76/76] Fix on-the-fly indexing of VCF w.r.t virtual offsets.

When using bcftools view --write-index -o out.vcf.gz the virtual file
offsets can differ depending on whether we do a bgzf_tell before or
after a flush.  Specifically it could point to the last byte in the
current BGZF block or the first byte in the next BGZF block.
Ultimately both of these resolve to the same physical location, but in
some situations the former may mean attempting to read zero bytes (the
remainder of the bgzf block).  This has been known in the past to be
misinterpreted as an EOF.  (See samtools/samtools#1861)

It also means the contents of the index produced by --write-index and
a separate bcftools index command can yield different results, albeit
both representing the same data.

The fix for the multi-threaded case of the samtools / bcftools issue
above (samtools/htslib#1672) inadvertently recreated the bug in the
non-multi-threaded case.

Fixes samtools/bcftools#2267
---
 NEWS  | 4 ++++
 vcf.c | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/NEWS b/NEWS
index be8de6682..4a573d91d 100644
--- a/NEWS
+++ b/NEWS
@@ -89,6 +89,10 @@ Bug fixes
 * Fix small OSS-Fuzz reported issues with CRAM encoding and long
   CIGARS and/or illegal positions. (PR #1775, PR #1801, PR #1817)
 
+* Fix issues with on-the-fly indexing of VCF/BCF (bcftools --write-index)
+  when not using multiple threads. (PR #1837. Fixes samtools/bcftools#2267,
+  reported by Giulio Genovese)
+
 * Stricter limits on POS / MPOS / TLEN in sam_parse1().  This fixes
   a signed overflow reported by OSS-Fuzz and should help prevent other
   as-yet undetected bugs. (PR #1812)
diff --git a/vcf.c b/vcf.c
index 7ce306f92..105c7539d 100644
--- a/vcf.c
+++ b/vcf.c
@@ -4238,6 +4238,8 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
     if ( fp->format.compression!=no_compression ) {
         if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
             return -1;
+        if (fp->idx && !fp->fp.bgzf->mt)
+            hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
         ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
     } else {
         ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);