Skip to content

Commit

Permalink
Read whole lines at once in fai_retrieve()
Browse files Browse the repository at this point in the history
Because fai_retrieve() is given only well-formatted input containing
lines of the same length, it already knows exactly where the base
characters and the non-graphic (line terminator) characters are. So in
general the interval to be read will look like

    ......ATGCAT    (read last six bases and line terminator)
    ATGCATGCATGC    (read complete line including line terminator)
    ATGCATGCATGC    (read complete line including line terminator)
    ATGC........    (read first four base characters)

and can be read a line at a time instead of a character at a time,
with special handling for the partial first and last lines, and with
the terminator characters at the end of each line read being discarded.
  • Loading branch information
jmarshall authored and whitwham committed Jul 8, 2024
1 parent f3d401c commit 3f0479b
Showing 1 changed file with 47 additions and 18 deletions.
65 changes: 47 additions & 18 deletions faidx.c
Original file line number Diff line number Diff line change
Expand Up @@ -715,9 +715,8 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {

static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) {
char *s;
size_t l;
int c = 0;
char *buffer, *s;
ssize_t nread, remaining, firstline_len, firstline_blen;
int ret;

if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
Expand All @@ -743,27 +742,57 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
return NULL;
}

l = 0;
s = (char*)malloc((size_t) end - beg + 2);
if (!s) {
// Over-allocate so there is extra space for one end-of-line sequence
buffer = (char*)malloc((size_t) end - beg + val->line_len - val->line_blen + 1);
if (!buffer) {
*len = -1;
return NULL;
}

BGZF *fp = fai->bgzf;
while ( l < end - beg && (c=bgzf_getc(fp))>=0 )
if (isgraph(c)) s[l++] = c;
if (c < 0) {
hts_log_error("Failed to retrieve block: %s",
c == -1 ? "unexpected end of file" : "error reading file");
free(s);
*len = -1;
return NULL;
remaining = *len = end - beg;
firstline_blen = val->line_blen - beg % val->line_blen;

// Special case when the entire interval requested is within a single FASTA/Q line
if (remaining <= firstline_blen) {
nread = bgzf_read_small(fai->bgzf, buffer, remaining);
if (nread < remaining) goto error;
buffer[nread] = '\0';
return buffer;
}

s = buffer;
firstline_len = val->line_len - beg % val->line_blen;

// Read the (partial) first line and its line terminator, but increment s past the
// line contents only, so the terminator characters will be overwritten by the next line.
nread = bgzf_read_small(fai->bgzf, s, firstline_len);
if (nread < firstline_len) goto error;
s += firstline_blen;
remaining -= firstline_blen;

// Similarly read complete lines and their line terminator characters, but overwrite the latter.
while (remaining > val->line_blen) {
nread = bgzf_read_small(fai->bgzf, s, val->line_len);
if (nread < (ssize_t) val->line_len) goto error;
s += val->line_blen;
remaining -= val->line_blen;
}

s[l] = '\0';
*len = l;
return s;
if (remaining > 0) {
nread = bgzf_read_small(fai->bgzf, s, remaining);
if (nread < remaining) goto error;
s += remaining;
}

*s = '\0';
return buffer;

error:
hts_log_error("Failed to retrieve block: %s",
(nread == 0)? "unexpected end of file" : "error reading file");
free(buffer);
*len = -1;
return NULL;
}

static int fai_get_val(const faidx_t *fai, const char *str,
Expand Down

0 comments on commit 3f0479b

Please sign in to comment.