diff --git a/src/main/java/edu/umd/marbl/mhap/main/AlignmentHashRun.java b/src/main/java/edu/umd/marbl/mhap/main/AlignmentHashRun.java index 105acfb..bef006a 100644 --- a/src/main/java/edu/umd/marbl/mhap/main/AlignmentHashRun.java +++ b/src/main/java/edu/umd/marbl/mhap/main/AlignmentHashRun.java @@ -87,7 +87,7 @@ public static void main(String[] args) throws Exception { data.enqueueFullFile(); //SimHashSearch hashSearch = new SimHashSearch(kmerSize, numWords); - SequenceSketchStreamer seqStreamer = new SequenceSketchStreamer(inFile, kmerSize, numWords, DEFAULT_SUB_SEQUENCE_SIZE, 12, null, null, 0); + SequenceSketchStreamer seqStreamer = new SequenceSketchStreamer(inFile, kmerSize, numWords, DEFAULT_SUB_SEQUENCE_SIZE, 12, null, null, true, 0); MinHashSearch hashSearch = new MinHashSearch(seqStreamer, numWords, DEFAULT_NUM_MIN_MATCHES, numThreads, true, 0, 800, DEFAULT_THRESHOLD); System.err.println("Processed "+seqStreamer.getNumberProcessed()+" sequences."); diff --git a/src/main/java/edu/umd/marbl/mhap/main/KmerStatSimulator.java b/src/main/java/edu/umd/marbl/mhap/main/KmerStatSimulator.java index 73335c8..3ff9b4c 100644 --- a/src/main/java/edu/umd/marbl/mhap/main/KmerStatSimulator.java +++ b/src/main/java/edu/umd/marbl/mhap/main/KmerStatSimulator.java @@ -186,8 +186,8 @@ public double compareKmers(String first, String second) { } public double compareMinHash(String first, String second) { - MinHash h1 = new MinHash(new Sequence(first, new SequenceId(1)), this.kmer, 1256, null, null); - MinHash h2 = new MinHash(new Sequence(second, new SequenceId(2)), this.kmer, 1256, null, null); + MinHash h1 = new MinHash(new Sequence(first, new SequenceId(1)), this.kmer, 1256, null, null, true); + MinHash h2 = new MinHash(new Sequence(second, new SequenceId(2)), this.kmer, 1256, null, null, true); return h1.jaccard(h2); } diff --git a/src/main/java/edu/umd/marbl/mhap/main/MhapMain.java b/src/main/java/edu/umd/marbl/mhap/main/MhapMain.java index 813b42f..3b63812 100644 --- a/src/main/java/edu/umd/marbl/mhap/main/MhapMain.java +++ b/src/main/java/edu/umd/marbl/mhap/main/MhapMain.java @@ -56,30 +56,19 @@ public final class MhapMain { private final double acceptScore; - private final HashSet filter; - private final String inFile; - private final int kmerSize; - private final double maxShift; - private final int minStoreLength; - private final boolean noSelf; - private final int numHashes; - private final int numMinMatches; - protected final int numThreads; - private final String processFile; - - private final int subSequenceSize; - + private final int subSequenceSize; private final String toFile; + private final boolean weighted; private final KmerCounts kmerCounter; @@ -121,6 +110,7 @@ public static void main(String[] args) throws Exception options.addOption("--max-shift", "[double], region size to the left and right of the estimated overlap, as derived from the median shift and sequence length, where a k-mer matches are still considered valid. Second stage filter only.", DEFAULT_MAX_SHIFT_PERCENT); options.addOption("--num-min-matches", "[int], minimum # min-mer that must be shared before computing second stage filter. Any sequences below that value are considered non-overlapping.", DEFAULT_NUM_MIN_MATCHES); options.addOption("--num-threads", "[int], number of threads to use for computation. Typically set to 2 x #cores.", DEFAULT_NUM_THREADS); + options.addOption("--weighted", "Perform weighted MinHashing.", false); options.addOption("--min-store-length", "[int], The minimum length of the read that is stored in the box. Used to filter out short reads from FASTA file.", DEFAULT_MIN_STORE_LENGTH); options.addOption("--no-self", "Do not compute the overlaps between sequences inside a box. Should be used when the to and from sequences are coming from different files.", false); options.addOption("--store-full-id", "Store full IDs as seen in FASTA file, rather than storing just the sequence position in the file. Some FASTA files have long IDS, slowing output of results. IDs not stored in compressed files.", false); @@ -289,6 +279,7 @@ public MhapMain(ParseOptions options) throws IOException this.minStoreLength = options.get("--min-store-length").getInteger(); this.maxShift = options.get("--max-shift").getDouble(); this.acceptScore = options.get("--threshold").getDouble(); + this.weighted = options.get("--weighted").getBoolean(); // read in the kmer filter set String filterFile = options.get("-f").getString(); @@ -299,7 +290,7 @@ public MhapMain(ParseOptions options) throws IOException System.err.println("Reading in filter file " + filterFile + "."); try { - this.filter = Utils.createKmerFilter(filterFile, options.get("--filter-threshold").getDouble(), this.kmerSize); + this.filter = Utils.createKmerFilter(filterFile, options.get("--filter-threshold").getDouble(), this.kmerSize, 0); } catch (Exception e) { @@ -343,14 +334,14 @@ public void run() while (seq != null) { //get the kmers integers - long[] kmerHashes = Utils.computeSequenceHashesLong(seq.getString(), MhapMain.this.kmerSize); + long[] kmerHashes = Utils.computeSequenceHashesLong(seq.getString(), MhapMain.this.kmerSize, 0); //store the values for (long val : kmerHashes) countMin.add(val); //get the kmers integers for reverse compliment - kmerHashes = Utils.computeSequenceHashesLong(seq.getReverseCompliment().getString(), MhapMain.this.kmerSize); + kmerHashes = Utils.computeSequenceHashesLong(seq.getReverseCompliment().getString(), MhapMain.this.kmerSize, 0); //store the values for (long val : kmerHashes) @@ -581,7 +572,7 @@ public SequenceSketchStreamer getSequenceHashStreamer(String file, int offset) t seqStreamer = new SequenceSketchStreamer(file, offset); else seqStreamer = new SequenceSketchStreamer(file, this.kmerSize, this.numHashes, this.subSequenceSize, - DEFAULT_ORDERED_KMER_SIZE, this.filter, this.kmerCounter, offset); + DEFAULT_ORDERED_KMER_SIZE, this.filter, this.kmerCounter, this.weighted, offset); return seqStreamer; } diff --git a/src/main/java/edu/umd/marbl/mhap/simhash/AbstractSequenceBitHash.java b/src/main/java/edu/umd/marbl/mhap/simhash/AbstractBitSketch.java similarity index 63% rename from src/main/java/edu/umd/marbl/mhap/simhash/AbstractSequenceBitHash.java rename to src/main/java/edu/umd/marbl/mhap/simhash/AbstractBitSketch.java index b32d8c8..38b4027 100644 --- a/src/main/java/edu/umd/marbl/mhap/simhash/AbstractSequenceBitHash.java +++ b/src/main/java/edu/umd/marbl/mhap/simhash/AbstractBitSketch.java @@ -31,17 +31,23 @@ import edu.umd.marbl.mhap.utils.MhapRuntimeException; -public abstract class AbstractSequenceBitHash> implements VectorHash, Comparable +public abstract class AbstractBitSketch> implements VectorHash, + Comparable { - protected long bits[]; - + protected final long[] bits; + + protected AbstractBitSketch(long[] bits) + { + this.bits = bits; + } + public final double adjScore(T sh) { - double score = jaccord(sh); - + double score = jaccard(sh); + return score; } - + public long[] getBits() { return this.bits; @@ -50,59 +56,61 @@ public long[] getBits() @Override public int compareTo(final T sim) { - for (int bitIndex=0; bitIndex this.bits[bitIndex]) + if (this.bits[bitIndex] > sim.bits[bitIndex]) return 1; } - + return 0; } public final int getIntersectionCount(final T sh) { - if (this.bits.length!=sh.bits.length) + if (this.bits.length != sh.bits.length) throw new MhapRuntimeException("Size of bits in tables must match."); - + int count = 0; - for (int longIndex=0; longIndex>> 1; - } - } + for (int longIndex = 0; longIndex < this.bits.length; longIndex++) + { + long mask = 1L << 63; + + for (int bit = 63; bit >= 0; bit--) + { + if ((this.bits[longIndex] & mask) == 0) + s.append("0"); + else + s.append("1"); + + mask = mask >>> 1; + } + } return s.toString(); } } \ No newline at end of file diff --git a/src/main/java/edu/umd/marbl/mhap/simhash/VectorHash.java b/src/main/java/edu/umd/marbl/mhap/simhash/VectorHash.java index 2496ddb..9014e04 100644 --- a/src/main/java/edu/umd/marbl/mhap/simhash/VectorHash.java +++ b/src/main/java/edu/umd/marbl/mhap/simhash/VectorHash.java @@ -31,5 +31,5 @@ public interface VectorHash { - double jaccord(T sh); + double jaccard(T sh); } diff --git a/src/main/java/edu/umd/marbl/mhap/sketch/MinHash.java b/src/main/java/edu/umd/marbl/mhap/sketch/MinHash.java index 16833fa..4f806de 100644 --- a/src/main/java/edu/umd/marbl/mhap/sketch/MinHash.java +++ b/src/main/java/edu/umd/marbl/mhap/sketch/MinHash.java @@ -58,7 +58,7 @@ public final class MinHash implements Serializable public final static int[] computeKmerMinHashesWeightedIntSuper(String seq, final int kmerSize, final int numHashes, - HashSet filter, KmerCounts kmerCount) + HashSet filter, KmerCounts kmerCount, boolean weighted) { final int numberKmers = seq.length() - kmerSize + 1; @@ -66,7 +66,7 @@ public final static int[] computeKmerMinHashesWeightedIntSuper(String seq, final throw new MhapRuntimeException("Kmer size bigger than string length."); // get the kmer hashes - final long[] kmerHashes = Utils.computeSequenceHashesLong(seq, kmerSize); + final long[] kmerHashes = Utils.computeSequenceHashesLong(seq, kmerSize, 0); //now compute the counts of occurance HashMap hitMap = new LinkedHashMap<>(kmerHashes.length); @@ -100,13 +100,23 @@ public final static int[] computeKmerMinHashesWeightedIntSuper(String seq, final { if ((kmerCount!=null && kmerCount.documentFrequencyRatio(key)>kmerCount.getFilterCutoff()) || (filter != null && filter.contains(key))) { - //System.err.println("Bad = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount)); + //System.err.println("Bad = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount)); + if (!weighted) + weight = 0; } else - weight = weight*3; + { + if (weighted) + weight = weight*3; + else + weight = 1; + } } //System.err.println("Good = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount)); //int weight = Math.min(1, (int)Math.round(kmerCount.weight(key, kmer.getValue().count, maxCount))); + + if (weight<=0) + continue; long x = key; @@ -241,14 +251,20 @@ private MinHash(int seqLength, int[] minHashes) this.minHashes = minHashes; } - public MinHash(Sequence seq, int kmerSize, int numHashes, HashSet filter, KmerCounts kmerCount) + public MinHash(Sequence seq, int kmerSize, int numHashes, HashSet filter, KmerCounts kmerCount, boolean weighted) { this.seqLength = seq.length(); //this.minHashes = MinHash.computeKmerMinHashes(seq.getString(), kmerSize, numHashes, filter); //this.minHashes = MinHash.computeKmerMinHashesWeighted(seq.getString(), kmerSize, numHashes, filter); //this.minHashes = MinHash.computeKmerMinHashesWeightedInt(seq.getString(), kmerSize, numHashes, filter, kmerCount); - this.minHashes = MinHash.computeKmerMinHashesWeightedIntSuper(seq.getString(), kmerSize, numHashes, filter, kmerCount); + this.minHashes = MinHash.computeKmerMinHashesWeightedIntSuper(seq.getString(), kmerSize, numHashes, filter, kmerCount, weighted); + } + + public MinHash(String str, int kmerSize, int numHashes) + { + this.seqLength = str.length(); + this.minHashes = MinHash.computeKmerMinHashesWeightedIntSuper(str, kmerSize, numHashes, null, null, true); } public byte[] getAsByteArray() diff --git a/src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketch.java b/src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketch.java index c8eb8fc..99a4478 100644 --- a/src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketch.java +++ b/src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketch.java @@ -94,10 +94,10 @@ public SequenceSketch(SequenceId id, MinHash mainHashes, OrderKmerHashes ordered } public SequenceSketch(Sequence seq, int kmerSize, int numHashes, int orderedKmerSize, - boolean storeHashes, HashSet filter, KmerCounts kmerCount) + boolean storeHashes, HashSet filter, KmerCounts kmerCount, boolean weighted) { this.id = seq.getId(); - this.mainHashes = new MinHash(seq, kmerSize, numHashes, filter, kmerCount); + this.mainHashes = new MinHash(seq, kmerSize, numHashes, filter, kmerCount, weighted); this.orderedHashes = new OrderKmerHashes(seq, orderedKmerSize); } diff --git a/src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketchStreamer.java b/src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketchStreamer.java index 33f95a7..a7e6e83 100644 --- a/src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketchStreamer.java +++ b/src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketchStreamer.java @@ -65,6 +65,7 @@ public class SequenceSketchStreamer private final AtomicLong numberSubSequencesProcessed; private final int numHashes; private final int offset; + private final boolean weighted; private final int orderedKmerSize; private boolean readClosed; @@ -78,6 +79,7 @@ public SequenceSketchStreamer(String file, int offset) throws FileNotFoundExcept this.sequenceHashList = new ConcurrentLinkedQueue(); this.numberProcessed = new AtomicLong(); this.kmerCounter = null; + this.weighted = true; this.kmerSize = 0; this.numHashes = 0; @@ -91,13 +93,14 @@ public SequenceSketchStreamer(String file, int offset) throws FileNotFoundExcept } public SequenceSketchStreamer(String file, int kmerSize, int numHashes, int subSequenceSize, int orderedKmerSize, - HashSet filter, KmerCounts kmerCounter, int offset) throws IOException + HashSet filter, KmerCounts kmerCounter, boolean weighted, int offset) throws IOException { this.fastaData = new FastaData(file, offset); this.readingFasta = true; this.sequenceHashList = new ConcurrentLinkedQueue(); this.numberProcessed = new AtomicLong(); + this.weighted = weighted; this.kmerCounter = kmerCounter; this.kmerSize = kmerSize; this.numHashes = numHashes; @@ -228,7 +231,7 @@ public int getFastaProcessed() public SequenceSketch getSketch(Sequence seq) { // compute the hashes - return new SequenceSketch(seq, this.kmerSize, this.numHashes, this.orderedKmerSize, false, this.filter, this.kmerCounter); + return new SequenceSketch(seq, this.kmerSize, this.numHashes, this.orderedKmerSize, false, this.filter, this.kmerCounter, this.weighted); } public int getNumberProcessed() diff --git a/src/main/java/edu/umd/marbl/mhap/utils/Utils.java b/src/main/java/edu/umd/marbl/mhap/utils/Utils.java index f230b59..aef041c 100644 --- a/src/main/java/edu/umd/marbl/mhap/utils/Utils.java +++ b/src/main/java/edu/umd/marbl/mhap/utils/Utils.java @@ -32,7 +32,6 @@ import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Random; @@ -48,8 +47,6 @@ import com.google.common.hash.Hasher; import com.google.common.hash.Hashing; -import edu.umd.marbl.mhap.general.Sequence; - public final class Utils { @@ -243,15 +240,15 @@ public final static int[] computeHashesIntString(String obj, int numWords, int s return hashes; } - public final static long[][] computeKmerHashes(final Sequence seq, final int kmerSize, final int numWords) + public final static long[][] computeKmerHashes(final String seq, final int kmerSize, final int numWords, final int seed) { - final int numberKmers = seq.numKmers(kmerSize); + final int numberKmers = seq.length()-kmerSize+1; if (numberKmers < 1) throw new MhapRuntimeException("Kmer size bigger than string length."); // get the rabin hashes - final int[] rabinHashes = computeSequenceHashes(seq.getString(), kmerSize); + final long[] rabinHashes = computeSequenceHashesLong(seq, kmerSize, seed); final long[][] hashes = new long[rabinHashes.length][numWords]; @@ -272,75 +269,29 @@ public final static long[][] computeKmerHashes(final Sequence seq, final int kme hashes[iter][word] = x; } } - + return hashes; } - - public final static int[][] computeKmerHashesInt(final Sequence seq, final int kmerSize, final int numWords, - HashSet filter) + + public final static long[][] computeKmerHashesExact(final String seq, final int kmerSize, final int numWords, final int seed) { - final int numberKmers = seq.numKmers(kmerSize); + HashFunction hf = Hashing.murmur3_128(seed); - if (numberKmers < 1) - throw new MhapRuntimeException("Kmer size bigger than string length."); - - // get the rabin hashes - final int[] rabinHashes = computeSequenceHashes(seq.getString(), kmerSize); - - final int[][] hashes = new int[rabinHashes.length][numWords]; - - // Random rand = new Random(0); - for (int iter = 0; iter < rabinHashes.length; iter++) + long[][] hashes = new long[seq.length() - kmerSize + 1][numWords]; + for (int iter = 0; iter < hashes.length; iter++) { - // do not compute minhash for filtered data, keep Integer.MAX_VALUE - if (filter != null && filter.contains(rabinHashes[iter])) + String subStr = seq.substring(iter, iter + kmerSize); + + for (int word=0; word>> 35); - x ^= (x << 4); - hashes[iter][word] = (int) x; + HashCode hc = hf.newHasher().putUnencodedChars(subStr).putInt(word).hash(); + hashes[iter][word] = hc.asLong(); } } - + return hashes; } - - public final static int[] computeKmerHashesIntBasic(final Sequence seq, final int kmerSize, final int numWords, - HashSet filter) - { - final int numberKmers = seq.numKmers(kmerSize); - - if (numberKmers < 1) - throw new MhapRuntimeException("Kmer size bigger than string length."); - - // get the rabin hashes - final int[] rabinHashes = computeSequenceHashes(seq.getString(), kmerSize); - - // Random rand = new Random(0); - for (int iter = 0; iter < rabinHashes.length; iter++) - { - // do not compute minhash for filtered data, keep Integer.MAX_VALUE - if (filter != null && filter.contains(rabinHashes[iter])) - { - rabinHashes[iter] = Integer.MAX_VALUE; - } - } - - return rabinHashes; - } - + public final static int[] computeSequenceHashes(final String seq, final int kmerSize) { // RollingSequenceHash rabinHash = new RollingSequenceHash(kmerSize); @@ -358,12 +309,9 @@ public final static int[] computeSequenceHashes(final String seq, final int kmer return hashes; } - public final static long[] computeSequenceHashesLong(final String seq, final int kmerSize) + public final static long[] computeSequenceHashesLong(final String seq, final int kmerSize, final int seed) { - // RollingSequenceHash rabinHash = new RollingSequenceHash(kmerSize); - // final int[] rabinHashes = rabinHash.hashInt(seq); - - HashFunction hf = Hashing.murmur3_128(0); + HashFunction hf = Hashing.murmur3_128(seed); long[] hashes = new long[seq.length() - kmerSize + 1]; for (int iter = 0; iter < hashes.length; iter++) @@ -375,20 +323,6 @@ public final static long[] computeSequenceHashesLong(final String seq, final int return hashes; } - public final static int[] computeStringNGramHashes(final String seq, final int ngramSize, final int seed) - { - HashFunction hf = Hashing.murmur3_32(seed); - - int[] hashes = new int[seq.length() - ngramSize + 1]; - for (int iter = 0; iter < hashes.length; iter++) - { - HashCode hc = hf.newHasher().putUnencodedChars(seq.substring(iter, iter + ngramSize)).hash(); - hashes[iter] = hc.asInt(); - } - - return hashes; - } - // add new line breaks every FASTA_LINE_LENGTH characters public final static String convertToFasta(String supplied) { @@ -427,7 +361,7 @@ public final static String convertToFasta(String supplied) } return converted.toString(); } - + public final static int countLetterInRead(String fasta, String letter) { return countLetterInRead(fasta, letter, false); @@ -459,7 +393,7 @@ public final static int countLetterInRead(String fasta, String letter, Boolean c return count; } - public final static HashSet createKmerFilter(String fileName, double maxPercent, int kmerSize) + public final static HashSet createKmerFilter(String fileName, double maxPercent, int kmerSize, int seed) throws IOException { File file = new File(fileName); @@ -484,7 +418,7 @@ public final static HashSet createKmerFilter(String fileName, double maxPe // if greater, add to hashset if (percent > maxPercent) { - long[] minHash = Utils.computeSequenceHashesLong(str[0], kmerSize); + long[] minHash = Utils.computeSequenceHashesLong(str[0], kmerSize, seed); if (minHash.length > 1) System.err.println("Warning filter file kmer size larger than setting!"); @@ -894,4 +828,50 @@ public final static String toProtein(String genome, boolean isReversed, int fram return result.toString(); } + + public static String toString(double[][] A) + { + StringBuilder s = new StringBuilder(); + + s.append("["); + + for (double[] a : A) + { + if (a != null) + { + for (int iter = 0; iter < a.length - 1; iter++) + s.append("" + a[iter] + ","); + + if (a.length > 0) + s.append("" + a[a.length - 1]); + } + s.append("\n"); + } + s.append("]"); + + return new String(s); + } + + public static String toString(long[][] A) + { + StringBuilder s = new StringBuilder(); + + s.append("["); + + for (long[] a : A) + { + if (a != null) + { + for (int iter = 0; iter < a.length - 1; iter++) + s.append("" + a[iter] + ","); + + if (a.length > 0) + s.append("" + a[a.length - 1]); + } + s.append("\n"); + } + s.append("]"); + + return new String(s); + } }