This repository has been archived by the owner on Aug 11, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 40
/
Copy pathtabix.tex
121 lines (113 loc) · 4.76 KB
/
tabix.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
\documentclass[10pt]{article}
\usepackage{color}
\definecolor{gray}{rgb}{0.7,0.7,0.7}
\setlength{\topmargin}{0.0cm}
\setlength{\textheight}{21.5cm}
\setlength{\oddsidemargin}{0cm}
\setlength{\textwidth}{16.5cm}
\setlength{\columnsep}{0.6cm}
\begin{document}
\title{The Tabix index file format}
\author{Heng Li}
\date{}
\maketitle
\begin{center}
\begin{tabular}{|l|l|l|l|l|l|l|}
\hline
\multicolumn{4}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\
\hline\hline
\multicolumn{4}{|l|}{\tt magic} & Magic string & {\tt char[4]} & TBI$\backslash$1 \\
\hline
\multicolumn{4}{|l|}{\tt n\_ref} & \# sequences & {\tt int32\_t} & \\
\hline
\multicolumn{4}{|l|}{\tt format} & Format (0: generic; 1: SAM; 2: VCF) & {\tt int32\_t} & \\
\hline
\multicolumn{4}{|l|}{\tt col\_seq} & Column for the sequence name & {\tt int32\_t} & \\
\hline
\multicolumn{4}{|l|}{\tt col\_beg} & Column for the start of a region & {\tt int32\_t} & \\
\hline
\multicolumn{4}{|l|}{\tt col\_end} & Column for the end of a region & {\tt int32\_t} & \\
\hline
\multicolumn{4}{|l|}{\tt meta} & Leading character for comment lines & {\tt int32\_t} & \\
\hline
\multicolumn{4}{|l|}{\tt skip} & \# lines to skip at the beginning & {\tt int32\_t} & \\
\hline
\multicolumn{4}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\
\hline
\multicolumn{4}{|l|}{\tt names} & Concatenated names, each zero terminated & {\tt char[l\_nm]} & \\
\hline
\multicolumn{7}{|c|}{\textcolor{gray}{\it List of indices (n=n\_ref)}}\\
\cline{2-7}
\hspace{0.1cm} & \multicolumn{3}{l|}{\tt n\_bin} & \# distinct bins (for the binning index) & {\tt int32\_t} & \\
\cline{2-7}
& \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct bins (n=n\_bin)}} \\
\cline{3-7}
& \hspace{0.1cm} & \multicolumn{2}{l|}{\tt bin} & Distinct bin number & {\tt uint32\_t} & \\
\cline{3-7}
& & \multicolumn{2}{l|}{\tt n\_chunk} & \# chunks & {\tt int32\_t} & \\
\cline{3-7}
& & \multicolumn{5}{c|}{\textcolor{gray}{\it List of chunks (n=n\_chunk)}} \\
\cline{4-7}
& & \hspace{0.1cm} & {\tt cnk\_beg} & Virtual file offset of the start of the chunk & {\tt uint64\_t} & \\
\cline{4-7}
& & & {\tt cnk\_end} & Virtual file offset of the end of the chunk & {\tt uint64\_t} & \\
\cline{2-7}
& \multicolumn{3}{l|}{\tt n\_intv} & \# 16kb intervals (for the linear index) & {\tt int32\_t} & \\
\cline{2-7}
& \multicolumn{6}{c|}{\textcolor{gray}{\it List of distinct intervals (n=n\_intv)}} \\
\cline{3-7}
& & \multicolumn{2}{l|}{\tt ioff} & File offset of the first record in the interval & {\tt uint64\_t} & \\
\hline
\end{tabular}
\end{center}
{\bf Notes:}
\begin{itemize}
\item The index file is BGZF compressed.
\item All integers are little-endian.
\item When {\tt (format\&0x10000)} is true, the coordinate follows the
{\tt BED} rule (i.e.\ half-closed-half-open and zero-based); otherwise,
the coordinate follows the {\tt GFF} rule (closed and one-based).
\item For the SAM format, the end of a region equals {\tt POS} plus the
reference length in the alignment, inferred from {\tt CIGAR}. For the
VCF format, the end of a region equals {\tt POS} plus the size of the
deletion.
\item Field {\tt col\_beg} may equal {\tt col\_end}, and in this case,
the end of a region is {\tt end}={\tt beg+1}.
\item Example. For {\tt GFF}, {\tt format}=0, {\tt col\_seq}=1, {\tt
col\_beg}=4, {\tt col\_end}=5, {\tt meta}=`{\tt \#}' and {\tt
skip}=0. For {\tt BED}, {\tt format}=0x10000, {\tt col\_seq}=1, {\tt
col\_beg}=2, {\tt col\_end}=3, {\tt meta}=`{\tt \#}' and {\tt
skip}=0.
\item Given a zero-based, half-closed and half-open region {\tt
[beg,end)}, the {\tt bin} number is calculated with the following C
function:
\begin{verbatim}
/* Bin number of the smallest bin that fully contains [beg,end). */
int reg2bin(int beg, int end) {
    int shift, bits;
    --end; /* make the end coordinate inclusive */
    /* Try the levels from the smallest bins (2^14 positions each)
       up to the largest ones (2^26 positions each). */
    for (shift = 14, bits = 15; shift <= 26; shift += 3, bits -= 3) {
        if ((beg >> shift) == (end >> shift))
            return ((1 << bits) - 1) / 7 + (beg >> shift);
    }
    return 0; /* no single smaller bin contains the region */
}
\end{verbatim}
\item The list of bins that may overlap a region {\tt [beg,end)} can be
obtained with the following C function:
\begin{verbatim}
#define MAX_BIN (((1<<18)-1)/7)
/* Store in list[] every bin that may overlap [rbeg,rend) and
   return how many bins were stored. */
int reg2bins(int rbeg, int rend, uint16_t list[MAX_BIN])
{
    /* number of the first bin on each level below bin 0 */
    static const int first[5] = { 1, 9, 73, 585, 4681 };
    int n = 0, lvl, shift, bin, last;
    --rend; /* make the end coordinate inclusive */
    list[n++] = 0; /* bin 0 is always a candidate */
    for (lvl = 0, shift = 26; lvl < 5; ++lvl, shift -= 3) {
        last = first[lvl] + (rend >> shift);
        for (bin = first[lvl] + (rbeg >> shift); bin <= last; ++bin)
            list[n++] = bin;
    }
    return n; // #elements in list[]
}
\end{verbatim}
\end{itemize}
\end{document}