Skip to content

Commit

Permalink
Merge branch 'dev' into meth
Browse files Browse the repository at this point in the history
  • Loading branch information
hasindu2008 committed Jul 17, 2024
2 parents 6e83b04 + 7318deb commit be9e20d
Show file tree
Hide file tree
Showing 16 changed files with 88 additions and 28 deletions.
1 change: 1 addition & 0 deletions docs/man.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Advanced options are as below:
- `--cdna`: generate cDNA reads (only valid with dna profiles and the reference must a transcriptome, experimental)
- `--trans-count FILE`: simulate relative abundance using specified 2-column tsv with first column containing transcript name and the second containing the count (only for direct-rna and cDNA, experimental). You may generate this a test fatq dataset using minimap2, for example, `minimap2 -cx map-ont transcripts.fa reads.fastq --secondary=no -t20 -uf | cut -f 6 | sort | uniq -c | awk '{print$2"\t"$1}'`.
- `--trans-trunc=yes/no`: simulate transcript truncation (only for direct-rna, experimental) [default: no]
- `--ont-friendly=yes/no`: generate fake uuid for readids and add a dummy end_reason [default: no]

Developer options (which are not much tested and error handling) are as below:

Expand Down
16 changes: 8 additions & 8 deletions docs/profile.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,16 @@ These profile sets the following advanced parameters in squigulator.

| Parameter | dna-r10-min | dna-r10-prom | rna004-min | rna004-prom |
|---------------------|---------------|-----------------|--------------|---------------|
| digitisation | 8192 | 2048 | 2048 | 2048 |
| digitisation | 8192 | 2048 | 8192 | 2048 |
| sample-rate | 4000 | 4000 | 4000 | 4000 |
| bps | 400 | 400 | 130 | 130 |
| range | 1536.598389 | 281.345551 | TBD | 299.432068 |
| offset-mean | 13.380569389 | -127.5655735 | TBD5 | -259.421128 |
| offset-std | 16.311471649 | 19.377283387665 | TBD | 16.010841823643 |
| median-before-mean | 202.154074388 | 189.87607393756 | TBD | 189.87607393756 |
| median-before-std | 13.406139242 | 15.788097978713 | TBD | 15.788097978713 |
| dwell-mean | 10.0 | 10.0 | TBD | 31.0 |
| dwell-std | 4.0 | 4.0 | TBD | 4.0 |
| range | 1536.598389 | 281.345551 | 1437.976685 | 299.432068 |
| offset-mean | 13.380569389 | -127.5655735 | 12.47686423863 | -259.421128 |
| offset-std | 16.311471649 | 19.377283387665 | 10.442126577137 | 16.010841823643 |
| median-before-mean | 202.154074388 | 189.87607393756 | 205.08496731088 | 189.87607393756 |
| median-before-std | 13.406139242 | 15.788097978713 | 8.6671292866233 | 15.788097978713 |
| dwell-mean | 10.0 | 10.0 | 31.0 | 31.0 |
| dwell-std | 4.0 | 4.0 | 0.0 | 0.0 |

## Determining parameters for a profile

Expand Down
40 changes: 37 additions & 3 deletions src/gensig.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ char *attach_prefix(core_t *core, const char *read, int32_t *len, aln_t *aln);
void gen_prefix_dna(int16_t *raw_signal, int64_t* n, int64_t *c, profile_t *profile, double offset);
int16_t *gen_prefix_rna(core_t *core, int16_t *raw_signal, int64_t* n, int64_t *c, double offset, int tid, aln_t *aln);

void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10){
void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10, double sample_frequency){

slow5_hdr_t *header=sp->header;

Expand Down Expand Up @@ -71,6 +71,11 @@ void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10){
ERROR("%s","Error adding sequencing_kit attribute");
exit(EXIT_FAILURE);
}
//add another header group attribute called sample_frequency
if (slow5_hdr_add("sample_frequency", header) < 0){
ERROR("%s","Error adding sample_frequency attribute");
exit(EXIT_FAILURE);
}

//set the run_id attribute to "run_0" for read group 0
if (slow5_hdr_set("run_id", "run_0", 0, header) < 0){
Expand Down Expand Up @@ -109,9 +114,21 @@ void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10){
ERROR("%s","Error setting sequencing_kit attribute in read group 0");
exit(EXIT_FAILURE);
}
//sample_frequency
if(sample_frequency<=0 || sample_frequency>1000000000){
ERROR("%s","A weird sample frequency. It should be between 0 and 1000000000 Hz");
exit(EXIT_FAILURE);
}
char buffer[100];
sprintf(buffer, "%d", (int)sample_frequency);
if (slow5_hdr_set("sample_frequency", buffer, 0, header) < 0){
ERROR("%s","Error setting sample_frequency attribute in read group 0");
exit(EXIT_FAILURE);
}

}

void set_header_aux_fields(slow5_file_t *sp){
void set_header_aux_fields(slow5_file_t *sp, int8_t ont_friendly){

//add auxilliary field: channel number
if (slow5_aux_add("channel_number", SLOW5_STRING, sp->header) < 0){
Expand Down Expand Up @@ -140,6 +157,15 @@ void set_header_aux_fields(slow5_file_t *sp){
ERROR("%s","Error adding start_time auxilliary field\n");
exit(EXIT_FAILURE);
}
//add auxilliary field: end_reason
if(ont_friendly){
const char *enum_labels[]={"unknown","partial","mux_change","unblock_mux_change","data_service_unblock_mux_change","signal_positive","signal_negative"};
uint8_t num_labels = 7;
if (slow5_aux_add_enum("end_reason", enum_labels, num_labels, sp->header) < 0){
ERROR("%s","Error adding end_reason auxilliary field\n");
exit(EXIT_FAILURE);
}
}
}

void set_record_primary_fields(profile_t *profile, slow5_rec_t *slow5_record, char *read_id, double offset, int64_t len_raw_signal, int16_t *raw_signal){
Expand All @@ -156,7 +182,7 @@ void set_record_primary_fields(profile_t *profile, slow5_rec_t *slow5_record, ch

}

void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double median_before, int32_t read_number, uint64_t start_time){
void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double median_before, int32_t read_number, uint64_t start_time, int8_t ont_friendly){

const char *channel_number = "0";
uint8_t start_mux = 0;
Expand Down Expand Up @@ -186,6 +212,14 @@ void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double m
exit(EXIT_FAILURE);
}

if(ont_friendly){
uint8_t end_reason = 0;
if(slow5_aux_set(slow5_record, "end_reason", &end_reason, sp->header) < 0){
ERROR("%s","Error setting end_reason auxilliary field\n");
exit(EXIT_FAILURE);
}
}

}


Expand Down
47 changes: 30 additions & 17 deletions src/sim.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ SOFTWARE.
#include "error.h"
#include "misc.h"

void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10);
void set_header_aux_fields(slow5_file_t *sp);
void set_header_attributes(slow5_file_t *sp, int8_t rna, int8_t r10, double sample_frequency);
void set_header_aux_fields(slow5_file_t *sp, int8_t ont_friendly);
void set_record_primary_fields(profile_t *profile, slow5_rec_t *slow5_record, char *read_id, double offset, int64_t len_raw_signal, int16_t *raw_signal);
void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double median_before, int32_t read_number, uint64_t start_time);
void set_record_aux_fields(slow5_rec_t *slow5_record, slow5_file_t *sp, double median_before, int32_t read_number, uint64_t start_time, int8_t ont_friendly);
uint32_t read_model(model_t* model, const char* file, uint32_t type);
uint32_t set_model(model_t* model, uint32_t model_id);

Expand Down Expand Up @@ -140,11 +140,11 @@ profile_t minion_rna004_rna_prof = {
.digitisation = 8192,
.sample_rate = 4000,
.bps = 130,
.range = 1536.598389,
.offset_mean=13.380569389019,
.offset_std=16.311471649012,
.median_before_mean=202.15407438804,
.median_before_std=13.406139241768,
.range = 1437.976685,
.offset_mean=12.47686423863,
.offset_std=10.442126577137,
.median_before_mean=205.08496731088,
.median_before_std=8.6671292866233,
.dwell_mean=31.0, //this must be sample_rate/bps for now
.dwell_std=0.0
};
Expand Down Expand Up @@ -172,8 +172,7 @@ static inline profile_t set_profile(char *prof_name, opt_t *opt){
opt->flag |= SQ_R10;
return minion_r10_dna_prof;
}else if(strcmp(prof_name, "rna004-min") == 0){
ERROR("%s","Parameters not determined for rna004 MinION. Please share some data!");
exit(EXIT_FAILURE);
WARNING("%s","Parameters and models for rna004-min are still crude. If you have good IVT data, please share!");
opt->flag |= SQ_R10;
opt->flag |= SQ_RNA;
return minion_rna004_rna_prof;
Expand Down Expand Up @@ -345,8 +344,8 @@ static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_fil
exit(EXIT_FAILURE);
}

set_header_attributes(core->sp, opt.flag & SQ_RNA ? 1 : 0, opt.flag & SQ_R10 ? 1 : 0);
set_header_aux_fields(core->sp);
set_header_attributes(core->sp, opt.flag & SQ_RNA ? 1 : 0, opt.flag & SQ_R10 ? 1 : 0, p.sample_rate);
set_header_aux_fields(core->sp, opt.flag & SQ_ONT ? 1 : 0);

if(slow5_hdr_write(core->sp) < 0){
ERROR("%s","Error writing header!\n");
Expand Down Expand Up @@ -494,7 +493,13 @@ void free_db(db_t* db) {
free(db);
}


void fake_uuid(char *read_id, int64_t num){
if(num>999999999999){
ERROR("Too many reads that my lazy fake uuid generator cannot handle %ld is too large. Open an issue and I will fix this",num);
exit(EXIT_FAILURE);
}
sprintf(read_id,"00000000-0000-0000-0000-%012d",(int)num);
}


//void gen_prefix_dna(int16_t *raw_signal, int64_t* n, int64_t *c, profile_t *profile, double offset);
Expand Down Expand Up @@ -556,7 +561,11 @@ void work_per_single_read(core_t* core,db_t* db, int32_t i, int tid) {

char *read_id= (char *)malloc(sizeof(char)*(10000));
MALLOC_CHK(read_id);
sprintf(read_id,"S1_%ld!%s!%d!%d!%c",core->total_reads+i+1, rid, ref_pos_st, ref_pos_end, strand);
if(opt.flag & SQ_ONT){
fake_uuid(read_id, core->total_reads+i+1);
} else {
sprintf(read_id,"S1_%ld!%s!%d!%d!%c",core->total_reads+i+1, rid, ref_pos_st, ref_pos_end, strand);
}
if(core->fp_fasta){
db->fasta[i] = (char *)malloc(sizeof(char)*(strlen(read_id)+strlen(seq)+10)); //+10 bit inefficent - for now
MALLOC_CHK(db->fasta[i]);
Expand Down Expand Up @@ -590,7 +599,7 @@ void work_per_single_read(core_t* core,db_t* db, int32_t i, int tid) {

int64_t n_samples = __sync_fetch_and_add(&core->n_samples, len_raw_signal);
set_record_primary_fields(&core->profile, slow5_record, read_id, offset, len_raw_signal, raw_signal);
set_record_aux_fields(slow5_record, sp, median_before, core->total_reads+i, n_samples);
set_record_aux_fields(slow5_record, sp, median_before, core->total_reads+i, n_samples, opt.flag & SQ_ONT ? 1 : 0);

//encode to a buffer
if (slow5_encode(&db->mem_records[i], &db->mem_bytes[i], slow5_record, sp) < 0){
Expand Down Expand Up @@ -686,7 +695,8 @@ static struct option long_options[] = {
{"trans-count", required_argument, 0, 0 }, //32 transcript count
{"trans-trunc", required_argument, 0, 0 }, //33 transcript truncate
{"cdna", no_argument, 0, 0 }, //34 cdna
{"meth-freq", required_argument, 0, 0 }, //35 meth-freq
{"ont-friendly", required_argument, 0, 0}, //35 ont-friendly
{"meth-freq", required_argument, 0, 0 }, //36 meth-freq
{0, 0, 0, 0}};


Expand Down Expand Up @@ -739,6 +749,7 @@ static void print_help(FILE *fp_help, opt_t opt, profile_t p, int64_t nreads) {
fprintf(fp_help," --cdna generate cDNA reads (only valid with dna profiles and the reference must a transcriptome, experimental)\n");
fprintf(fp_help," --trans-count FILE simulate relative abundance using specified tsv with transcript name & count (only for direct-rna and cDNA, experimental)\n");
fprintf(fp_help," --trans-trunc=yes/no simulate transcript truncattion (only for direct-rna, experimental) [no]\n");
fprintf(fp_help," --ont-friendly=yes/no generate fake uuid for readids and add a dummy end_reason [no]\n");
fprintf(fp_help," --meth-freq FILE simulate CpG methylation using specified frequency file, tsv file with chr, 0-based pos and freq as the columns (only for DNA, experimental)\n");

fprintf(fp_help,"\ndeveloper options (not much tested yet):\n");
Expand Down Expand Up @@ -974,7 +985,9 @@ int sim_main(int argc, char* argv[], double realtime0) {
opt_gvn.cdna = 1;
opt.flag |= SQ_CDNA;
WARNING("%s","Option --cdna is experimental. Please report any issues.")
} else if (c == 0 && longindex == 35){ //transcript count
} else if (c == 0 && longindex == 35){ //ont-friendly
yes_or_no(&opt, SQ_ONT, longindex, optarg, 1);
} else if (c == 0 && longindex == 36){ //meth freq
opt.meth_freq = optarg;
WARNING("%s","Option --meth-freq is experimental. Please report any issues.")
} else if (c == '?'){
Expand Down
1 change: 1 addition & 0 deletions src/sq.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#define SQ_PAF_REF 0x080 //in paf output, use ref as target
#define SQ_TRANS_TRUNC 0x100 //trans-trunc
#define SQ_CDNA 0x200 //CDNA
#define SQ_ONT 0x400 //ont friendly

#define WORK_STEAL 1 //simple work stealing enabled or not (no work stealing mean no load balancing)
#define STEAL_THRESH 1 //stealing threshold
Expand Down
1 change: 1 addition & 0 deletions test/dna_full_contig.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_ideal_amp_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_ideal_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_ideal_time_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_prefix_slow5.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_r10_paf-ref.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 5000
@sequencing_kit sqk-lsk114
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/dna_r10_paf.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 5000
@sequencing_kit sqk-lsk114
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/rna_paf.exp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type rna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 3000
@sequencing_kit sqk-rna002
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/rna_prefixno_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type rna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 3000
@sequencing_kit sqk-rna002
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/rna_slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type rna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 3000
@sequencing_kit sqk-rna002
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down
1 change: 1 addition & 0 deletions test/slow5.exp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
@experiment_type genomic_dna
@flow_cell_id FAN00000
@run_id run_0
@sample_frequency 4000
@sequencing_kit sqk-lsk109
#char* uint32_t double double double double uint64_t int16_t* char* double int32_t uint8_t uint64_t
#read_id read_group digitisation offset range sampling_rate len_raw_signal raw_signal channel_number median_before read_number start_mux start_time
Expand Down

0 comments on commit be9e20d

Please sign in to comment.