Skip to content

Commit

Permalink
Merge pull request #14 from hasindu2008/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
hasindu2008 authored Feb 19, 2024
2 parents 272c0e3 + 877f6aa commit 1b17167
Show file tree
Hide file tree
Showing 41 changed files with 527,684 additions and 263,391 deletions.
27 changes: 24 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ OBJ = $(BUILD_DIR)/main.o \
$(BUILD_DIR)/misc.o \
$(BUILD_DIR)/sim.o \
$(BUILD_DIR)/thread.o \
$(BUILD_DIR)/format.o \
$(BUILD_DIR)/gensig.o \
$(BUILD_DIR)/genread.o \
$(BUILD_DIR)/ref.o \

PREFIX = /usr/local
VERSION = `git describe --tags`
Expand All @@ -29,7 +33,11 @@ endif
$(BINARY): $(OBJ) slow5lib/lib/libslow5.a
$(CC) $(CFLAGS) $(OBJ) slow5lib/lib/libslow5.a $(LDFLAGS) -o $@

$(BUILD_DIR)/main.o: src/main.c src/misc.h src/error.h src/sq.h
HEADERS = src/error.h src/format.h src/misc.h src/model.h \
src/rand.h src/ref.h src/seq.h src/sq.h src/str.h src/version.h \
src/kseq.h src/khash.h src/ksort.h

$(BUILD_DIR)/main.o: src/main.c $(HEADERS)
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@

$(BUILD_DIR)/model.o: src/model.c src/model.h src/misc.h
Expand All @@ -38,12 +46,25 @@ $(BUILD_DIR)/model.o: src/model.c src/model.h src/misc.h
$(BUILD_DIR)/thread.o: src/thread.c
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@

$(BUILD_DIR)/misc.o: src/misc.c
$(BUILD_DIR)/misc.o: src/misc.c src/error.h
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@

$(BUILD_DIR)/sim.o: src/sim.c $(HEADERS)
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@

$(BUILD_DIR)/format.o: src/format.c $(HEADERS)
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@

$(BUILD_DIR)/sim.o: src/sim.c src/ref.h src/misc.h src/str.h
$(BUILD_DIR)/gensig.o: src/gensig.c $(HEADERS)
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@

$(BUILD_DIR)/genread.o: src/genread.c $(HEADERS)
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@

$(BUILD_DIR)/ref.o: src/ref.c $(HEADERS)
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@


slow5lib/lib/libslow5.a:
$(MAKE) -C slow5lib zstd=$(zstd) no_simd=$(no_simd) zstd_local=$(zstd_local) lib/libslow5.a

Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ After getting the basic *ssssim* implemented in ~8 hours and successfully baseca
For x86-64 Linux, you can use the precompiled binaries under [releases](https://github.com/hasindu2008/squigulator/releases):

```
VERSION=0.2.2
VERSION=0.2.2-dirty
wget https://github.com/hasindu2008/squigulator/releases/download/v${VERSION}/squigulator-v${VERSION}-x86_64-linux-binaries.tar.gz
tar xf squigulator-v${VERSION}-x86_64-linux-binaries.tar.gz && cd squigulator-v${VERSION}
./squigulator --help
Expand Down Expand Up @@ -78,6 +78,8 @@ By default, DNA PromethION reads (R9.4.1) will be simulated. Specify the `-x STR
- `rna-r9-prom`: direct RNA on PromethION R9.4.1 flowcells
- `dna-r10-min`: genomic DNA on MinION R10.4.1 flowcells
- `dna-r10-prom`: genomic DNA on PromethION R10.4.1 flowcells
- `rna004-min`: direct RNA on MinION RNA004 flowcells
- `rna004-prom`: direct RNA on promethION RNA004 flowcells

If a genomic DNA profile is selected, the input reference must be the **reference genome in *FASTA* format**. *squigulator* will randomly sample the genome from a uniform distribution and generate reads whose lengths are from a gamma distribution (based on `-r`). If a direct RNA profile is selected, the input reference must be the **transcriptome in *FASTA* format**. For RNA, *squigulator* will randomly pick transcripts from a uniform distribution and the whole transcript length is simulated.

Expand Down
8 changes: 6 additions & 2 deletions docs/man.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Basic options in *squigulator* are as below:

- `-o FILE`: SLOW5/BLOW5 file to write.
- `-x STR`: Parameter profile (always applied before other options). Available profiles are: *dna-r9-min*, *dna-r9-prom, rna-r9-min*, *rna-r9-prom*, *dna-r10-min*, *dna-r10-prom*. [default: dna-r9-prom]
- `-x STR`: Parameter profile (always applied before other options). Available profiles are: *dna-r9-min*, *dna-r9-prom, rna-r9-min*, *rna-r9-prom*, *dna-r10-min*, *dna-r10-prom*, *rna004-min, *rna004-prom* [default: dna-r9-prom]
- `-n INT`: Number of reads to simulate. [default: 4000]
- `-r INT `: Mean read length (estimated mean only, unused for RNA) [default: 10000]
- `-f INT`: fold coverage to simulate (incompatible with -n)
Expand All @@ -29,6 +29,9 @@ Advanced options are as below:
- `--prefix=yes/no`: generate prefixes such as adaptor (and polya for RNA). [default: no]
- `--seed INT`: seed for random generators (if 0, will be autogenerated). Giving the same seed will produce same results. [default: 0]
- `--paf-ref`: in paf output, use the reference as the target instead of read (needs -c)
- `--cdna`: generate cDNA reads (only valid with dna profiles and the reference must a transcriptome, experimental)
- `--trans-count FILE`: simulate relative abundance using specified 2-column tsv with first column containing transcript name and the second containing the count (only for direct-rna and cDNA, experimental)
- `--trans-trunc=yes/no`: simulate transcript truncation (only for direct-rna and cDNA, experimental) [default: no]

Developer options (which are not much tested and error handling) are as below:

Expand All @@ -37,4 +40,5 @@ Developer options (which are not much tested and error handling) are as below:
- `--range FLOAT`: ADC range (see [here](https://hasindu2008.github.io/slow5specs/summary))
- `--offset-mean FLOAT`: ADC offset mean (see [here](https://hasindu2008.github.io/slow5specs/summary))
- `--offset-std FLOAT`: ADC offset standard deviation (see [here](https://hasindu2008.github.io/slow5specs/summary))

- `--median-before-mean`: Median before mean (see [here](https://hasindu2008.github.io/slow5specs/summary))
- `--median-before-std`: Median before standard deviation (see [here](https://hasindu2008.github.io/slow5specs/summary))
110 changes: 110 additions & 0 deletions docs/profile.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Parameter profiles

Inspired by parameter presets in Minimap2, Squigulator has parameter profiles/presets that can be used via the `-x option`. This provides the user with a convenient way to simulate different chemistries and flowcells, without having to use advanced parameters.

Parameter profiles are currently available for R9.4.1, R10.4.1 amd RNA004. More profiles will be added in future when new nanopore chemistries are introduced.

## R9 profiles

Following profiles are available for R9.4.1.

- `dna-r9-min`: DNA R9.4.1 MinION (or GridION)
- `dna-r9-prom`: DNA R9.4.1 promethION (or P2solo)
- `rna-r9-min`: RNA R9.4.1 MinION (or GridION)
- `rna-r9-prom`: RNA R9.4.1 promethION (or P2solo)

These profile sets the following advanced parameters in squigulator.

| Parameter | dna-r9-min | dna-r9-prom | rna-r9-min | rna-r9-prom |
|----------------------|--------------|---------------|--------------|---------------|
| digitisation | 8192 | 2048 | 8192 | 2048 |
| sample-rate | 4000 | 4000 | 3012 | 3000 |
| bps | 450 | 450 | 70 | 70 |
| range | 1443.030273 | 748.5801 | 1126.47 | 548.788269 |
| offset-mean | 13.7222605 | -237.4102 | 4.65491888 | -231.9440589 |
| Offset-std | 10.25279688 | 14.1575 | 4.115262472 | 12.87185278 |
| median-before-mean | 200.815801 | 214.2890337 | 242.6584118 | 238.5286796 |
| median-before-std | 20.48933762 | 18.0127916 | 10.60230888 | 21.1871794 |
| dwell-mean | 9.0 | 9.0 | 43.0 | 43.0 |
| dwell-std | 4.0 | 4.0 | 35.0 | 35.0 |


## R10 and RNA004 profiles

Following profiles are available for R10.4.1 and RNA004.

- `dna-r10-min`: DNA R10.4.1 MinION (or GridION)
- `dna-r10-prom`: DNA R10.4.1 promethION (or P2solo)
- `rna004-min`: RNA004 MinION (or GridION)
- `rna004-prom`: RNA004 promethION (or P2solo)

These profile sets the following advanced parameters in squigulator.


| Parameter | dna-r10-min | dna-r10-prom | rna004-min | rna004-prom |
|---------------------|---------------|-----------------|--------------|---------------|
| digitisation | 8192 | 2048 | 2048 | 2048 |
| sample-rate | 4000 | 4000 | 4000 | 4000 |
| bps | 400 | 400 | 130 | 130 |
| range | 1536.598389 | 281.345551 | TBD | 299.432068 |
| offset-mean | 13.380569389 | -127.5655735 | TBD5 | -259.421128 |
| offset-std | 16.311471649 | 19.377283387665 | TBD | 16.010841823643 |
| median-before-mean | 202.154074388 | 189.87607393756 | TBD | 189.87607393756 |
| median-before-std | 13.406139242 | 15.788097978713 | TBD | 15.788097978713 |
| dwell-mean | 10.0 | 10.0 | TBD | 31.0 |
| dwell-std | 4.0 | 4.0 | TBD | 4.0 |

## Determining parameters for a profile

This section briefly describes how these parameter profiles are created for a given flow-cell chemistry. Following requirements and assumptions must be met.

- A high quality pore model must be available. ONT usually provides k-mer models for their chemistries [here](https://github.com/nanoporetech/kmer_models). For R9.4.1 both the level means and the standard deviation values were provided, however, for R10.4.1 and RNA004 only level means have been provided. The method described [here](https://hasindu2008.github.io/f5c/docs/r10train) can be used for deducing the standard deviation values.

- You need some sample signal data in BLOW5 format and [slow5tools](https://github.com/hasindu2008/slow5tools) set up. If the data is in FAST5, use [slow5tools f2s](https://github.com/hasindu2008/slow5tools) to convert. If data is in POD5, use [blue-crab](https://github.com/Psy-Fer/blue-crab).

- Install `datamash` (e.g., apt-get install datamash)


Now let us see how the the parameters are deduced.

- `digitisation`. This is the digitisation field in the BLOW5 file as explained [here](https://hasindu2008.github.io/slow5specs/summary). The digitisation value must be the same across the whole dataset. Infact, so far as far as we know, MinIONs/GridIONs always has a digitisation of 8192 and promethION/P2 has 2048. You can deduce the digitisation using the dataset by using the example command below (you should only see one value):

```
slow5tools skim -t40 reads_500k.blow5 | cut -f 3 | tail -n+2 | sort -u
2048
```

- `sample-rate`. This is the sample-rate field in the BLOW5 file as explained [here](https://hasindu2008.github.io/slow5specs/summary). This value must be the same across the whole dataset. To deduce this value, you can use the following example command (you should only see one value):

```
slow5tools skim -t40 reads_500k.blow5 | cut -f 6 | tail -n+2 | sort -u
4000
```

- `bps`. This is the translocation speed which can be found on the relevant Guppy/Dorado basecalling model. For example, the Dorado model for rna004 is `rna004_130bps_sup@v3.0.1`. The bps is 130.

- `range`. This is the range field in the BLOW5 file as explained [here](https://hasindu2008.github.io/slow5specs/summary). This value also must be the same across the whole dataset. Just as before, example command (you should only see one value):

```
slow5tools skim -t40 reads_500k.blow5 | cut -f 5 | tail -n+2 | sort -u
299.432068
```

- `offset-mean` and `offset-std`. This is the mean and standard deviation of the [offset field in the BLOW5 file](https://hasindu2008.github.io/slow5specs/summary). Example command to get the two parameters using datamash:

```
slow5tools skim -t40 500k.blow5 | cut -f 4 | tail -n+2 | datamash mean 1 sstdev 1
-259.421128 16.010841823643
```

- `median-before-mean` and `median-before-std`. This is the mean and standard deviation of the [median_before field in the BLOW5 file](https://hasindu2008.github.io/slow5specs/summary). Example command to get the two parameters:

```
slow5tools skim -t40 reads_500k.blow5 | awk -v c="median_before" 'NR==1{for (i=1; i<=NF; i++) if ($i==c){p=i; break};} {if ($p!=".") print $p}' | tail -n+2 | datamash mean 1 sstdev 1
205.63935594369 8.3994882799157
```

- `dwell-mean`. This is the mean of the number of signal samples per base. This must be equal to the `sample_rate`/`bps`, and is just used as a sanity check.

- `dwell-std`. This is the standard deviation of number of signal samples per base. Setting this to 0 gives the highest basecalling accuracy. Increasing the value will reduce the accuracy.
The best value for this parameter can be empirically determined by basecalling the data simulated with different values for this parameter and choosing the value that gives the closest accuracy to basecalls of the real data.
17 changes: 17 additions & 0 deletions scripts/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,21 @@ ex ./squigulator -x dna-r10-prom -o a.slow5 -n 2 --seed 2 --dwell-std 4.0 -t1 te
diff -q test/dna_r10_paf-ref.exp a.slow5 || die "diff failed"
diff -q test/dna_r10_paf-ref.sam.exp a.sam || die "diff failed"


redundancy_check () {
N=$(grep -v ^[@#] a.slow5 | cut -f ${1} | sort | uniq -c | sort -nr -k1,1 | head -1 | awk '{print $1}')
[ "$N" != "1" ] && die "failed thread test for column ${1}"
}
ex ./squigulator -x dna-r9-min test/nCoV-2019.reference.fasta -n 100 -t 8 -K 10 -o a.slow5 --seed 1
# read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal
# 9channel_number 10median_before 11read_number 12start_mux 13start_time
redundancy_check 1
redundancy_check 4
redundancy_check 7
redundancy_check 8
redundancy_check 10
redundancy_check 11
redundancy_check 13
echo "Test passed"
10 changes: 10 additions & 0 deletions slow5lib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ else
CFLAGS += -DSLOW5_USE_ZSTD
CPPFLAGS += -I $(zstd_local)
endif
ifeq ($(slow5_mt),1)
CFLAGS += -DSLOW5_ENABLE_MT
LDFLAGS += -lpthread
endif

BUILD_DIR = lib

STATICLIB = $(BUILD_DIR)/libslow5.a
Expand All @@ -28,6 +33,7 @@ OBJ = $(BUILD_DIR)/slow5.o \
$(BUILD_DIR)/slow5_idx.o \
$(BUILD_DIR)/slow5_misc.o \
$(BUILD_DIR)/slow5_press.o \
$(BUILD_DIR)/slow5_mt.o \

PREFIX = /usr/local
VERSION = `git describe --tags`
Expand Down Expand Up @@ -61,6 +67,9 @@ $(BUILD_DIR)/slow5_misc.o: src/slow5_misc.c src/slow5_misc.h include/slow5/slow5
$(BUILD_DIR)/slow5_press.o: src/slow5_press.c include/slow5/slow5_press.h src/slow5_misc.h include/slow5/slow5_error.h
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -fpic -o $@

$(BUILD_DIR)/slow5_mt.o: src/slow5_mt.c include/slow5/slow5_mt.h $(SLOW5_H)
$(CC) $(CFLAGS) $(CPPFLAGS) $< -c -fpic -o $@

clean:
rm -rf $(OBJ) $(STATICLIB) $(SHAREDLIB) $(SHAREDLIBV)
make -C $(SVB) clean
Expand All @@ -81,6 +90,7 @@ pyslow5:
python3 setup.py build
cp build/lib.*/*.so ./
python3 < python/example.py
rm -rf .eggs/
python3 setup.py sdist


Expand Down
Loading

0 comments on commit 1b17167

Please sign in to comment.