From 51ee40e144a607c986f207dc36f2ae1d20ffe8f3 Mon Sep 17 00:00:00 2001 From: ppillot Date: Tue, 2 Jan 2024 19:38:08 -0500 Subject: [PATCH 1/2] Do not mangle API related properties --- rollup.config.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rollup.config.mjs b/rollup.config.mjs index 68516a7..bb56f4c 100644 --- a/rollup.config.mjs +++ b/rollup.config.mjs @@ -29,7 +29,7 @@ export default { compress: {ecma: 2015, passes: 3, unsafe: true}, mangle: { properties: { - reserved: ['biomsa'] + reserved: ['biomsa', 'gapopen', 'gapextend', 'matrix', 'method', 'type', 'gapchar', 'debug'] } }, nameCache: {} From b3d2b92193cb3b0080a9dff101548c44990b84e1 Mon Sep 17 00:00:00 2001 From: ppillot Date: Fri, 5 Jan 2024 22:13:25 -0500 Subject: [PATCH 2/2] use Simple Matching Distance for global similarity The rationale is that when unrelated sequences are compared in a set of related sequences, if the sizes are different, longer sequences can have converging Kmers. The SMD also counts k-mers that are absent from both sequences as matches, which counterbalances random matches. --- src/sequence/sequence.ts | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/sequence/sequence.ts b/src/sequence/sequence.ts index 10481ec..c7159c0 100644 --- a/src/sequence/sequence.ts +++ b/src/sequence/sequence.ts @@ -261,8 +261,10 @@ export function distanceMatrix(tabSeq: TSequence[]) { // Here, for reasons of computational speed, we compute binary matchings: each // kmer value is associated with a bit in a BitSet. Intersection size between // bitsets increases with sequence proximity. - // The distance is computed as a Tanimoto distance between the BitSets of 2 - // sequences. + // The distance is computed as a Simple Matching Distance between the + // BitSets of 2 sequences. Compared to a Tanimoto distance, it also counts + // non-set bits (0) as matches which disfavours comparisons between sequences + // of various sizes. 
// When comparing several sequences between each other, if the sequences have // a noticeable variety of sizes, longer sequences will tend to have more // matches than shorter ones. @@ -277,7 +279,7 @@ export function distanceMatrix(tabSeq: TSequence[]) { const l = tabSeq.length; const distTab: number[][] = tabSeq.map(() => []); let lKmerI: BitArray; - let lDistance: number; + let distance: number; let kbitsICount: number; let kbitsJCount: number; let commonKbitsCount: number; @@ -291,25 +293,24 @@ export function distanceMatrix(tabSeq: TSequence[]) { backgroundMatchingProbability = computeBackgroundKmerMatch(kbitsICount, bitsetLength) for (let j = i + 1; j < l; j++) { - commonKbitsCount = lKmerI.getIntersectionSize(lKmer[j]) kbitsJCount = lKmer[j].getSize() + commonKbitsCount = lKmerI.getIntersectionSize(lKmer[j]) + commonKbitsCount += bitsetLength - (kbitsICount + kbitsJCount - commonKbitsCount) expectedRandomMatches = Math.ceil(backgroundMatchingProbability * kbitsJCount); commonKbitsCount -= expectedRandomMatches; + commonKbitsCount = Math.max(commonKbitsCount, 0) // Tanimoto/Jacquard distance corrected for random matches - lDistance = 1 - ( - commonKbitsCount / ( - kbitsICount - expectedRandomMatches - + kbitsJCount - expectedRandomMatches - - commonKbitsCount - ) - ); - distTab[j][i] = distTab[i][j] = Math.max(lDistance, 0); + distance = 1 - (commonKbitsCount / bitsetLength); + distTab[j][i] = distTab[i][j] = distance; } } - if (DEBUG) Log.add('K-mer distance computation'); + if (DEBUG) { + Log.add('K-mer distance computation'); + console.table(distTab) + } return distTab; }