Skip to content

Commit

Permalink
added simhash
Browse files Browse the repository at this point in the history
  • Loading branch information
konstantinberlin committed Jan 31, 2015
1 parent aaf4927 commit 5b60eb4
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 154 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public static void main(String[] args) throws Exception {
data.enqueueFullFile();

//SimHashSearch hashSearch = new SimHashSearch(kmerSize, numWords);
SequenceSketchStreamer seqStreamer = new SequenceSketchStreamer(inFile, kmerSize, numWords, DEFAULT_SUB_SEQUENCE_SIZE, 12, null, null, 0);
SequenceSketchStreamer seqStreamer = new SequenceSketchStreamer(inFile, kmerSize, numWords, DEFAULT_SUB_SEQUENCE_SIZE, 12, null, null, true, 0);

MinHashSearch hashSearch = new MinHashSearch(seqStreamer, numWords, DEFAULT_NUM_MIN_MATCHES, numThreads, true, 0, 800, DEFAULT_THRESHOLD);
System.err.println("Processed "+seqStreamer.getNumberProcessed()+" sequences.");
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/edu/umd/marbl/mhap/main/KmerStatSimulator.java
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,8 @@ public double compareKmers(String first, String second) {
}

/**
 * Estimates the Jaccard similarity between two sequences by building a
 * MinHash sketch of each and comparing the sketches.
 *
 * @param first  first sequence string
 * @param second second sequence string
 * @return estimated Jaccard similarity between the two sequences' k-mer sets
 */
public double compareMinHash(String first, String second) {
	// 1256 hash values per sketch; no k-mer filter or frequency table,
	// weighted hashing enabled. NOTE(review): 1256 looks like a tuning
	// constant — consider promoting it to a named constant or parameter.
	MinHash h1 = new MinHash(new Sequence(first, new SequenceId(1)), this.kmer, 1256, null, null, true);
	MinHash h2 = new MinHash(new Sequence(second, new SequenceId(2)), this.kmer, 1256, null, null, true);

	return h1.jaccard(h2);
}
Expand Down
25 changes: 8 additions & 17 deletions src/main/java/edu/umd/marbl/mhap/main/MhapMain.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,30 +56,19 @@
public final class MhapMain
{
private final double acceptScore;

private final HashSet<Long> filter;

private final String inFile;

private final int kmerSize;

private final double maxShift;

private final int minStoreLength;

private final boolean noSelf;

private final int numHashes;

private final int numMinMatches;

protected final int numThreads;

private final String processFile;

private final int subSequenceSize;

private final int subSequenceSize;
private final String toFile;
private final boolean weighted;

private final KmerCounts kmerCounter;

Expand Down Expand Up @@ -121,6 +110,7 @@ public static void main(String[] args) throws Exception
options.addOption("--max-shift", "[double], region size to the left and right of the estimated overlap, as derived from the median shift and sequence length, where a k-mer matches are still considered valid. Second stage filter only.", DEFAULT_MAX_SHIFT_PERCENT);
options.addOption("--num-min-matches", "[int], minimum # min-mer that must be shared before computing second stage filter. Any sequences below that value are considered non-overlapping.", DEFAULT_NUM_MIN_MATCHES);
options.addOption("--num-threads", "[int], number of threads to use for computation. Typically set to 2 x #cores.", DEFAULT_NUM_THREADS);
options.addOption("--weighted", "Perform weighted MinHashing.", false);
options.addOption("--min-store-length", "[int], The minimum length of the read that is stored in the box. Used to filter out short reads from FASTA file.", DEFAULT_MIN_STORE_LENGTH);
options.addOption("--no-self", "Do not compute the overlaps between sequences inside a box. Should be used when the to and from sequences are coming from different files.", false);
options.addOption("--store-full-id", "Store full IDs as seen in FASTA file, rather than storing just the sequence position in the file. Some FASTA files have long IDS, slowing output of results. IDs not stored in compressed files.", false);
Expand Down Expand Up @@ -289,6 +279,7 @@ public MhapMain(ParseOptions options) throws IOException
this.minStoreLength = options.get("--min-store-length").getInteger();
this.maxShift = options.get("--max-shift").getDouble();
this.acceptScore = options.get("--threshold").getDouble();
this.weighted = options.get("--weighted").getBoolean();

// read in the kmer filter set
String filterFile = options.get("-f").getString();
Expand All @@ -299,7 +290,7 @@ public MhapMain(ParseOptions options) throws IOException
System.err.println("Reading in filter file " + filterFile + ".");
try
{
this.filter = Utils.createKmerFilter(filterFile, options.get("--filter-threshold").getDouble(), this.kmerSize);
this.filter = Utils.createKmerFilter(filterFile, options.get("--filter-threshold").getDouble(), this.kmerSize, 0);
}
catch (Exception e)
{
Expand Down Expand Up @@ -343,14 +334,14 @@ public void run()
while (seq != null)
{
//get the kmers integers
long[] kmerHashes = Utils.computeSequenceHashesLong(seq.getString(), MhapMain.this.kmerSize);
long[] kmerHashes = Utils.computeSequenceHashesLong(seq.getString(), MhapMain.this.kmerSize, 0);

//store the values
for (long val : kmerHashes)
countMin.add(val);

//get the kmers integers for reverse compliment
kmerHashes = Utils.computeSequenceHashesLong(seq.getReverseCompliment().getString(), MhapMain.this.kmerSize);
kmerHashes = Utils.computeSequenceHashesLong(seq.getReverseCompliment().getString(), MhapMain.this.kmerSize, 0);

//store the values
for (long val : kmerHashes)
Expand Down Expand Up @@ -581,7 +572,7 @@ public SequenceSketchStreamer getSequenceHashStreamer(String file, int offset) t
seqStreamer = new SequenceSketchStreamer(file, offset);
else
seqStreamer = new SequenceSketchStreamer(file, this.kmerSize, this.numHashes, this.subSequenceSize,
DEFAULT_ORDERED_KMER_SIZE, this.filter, this.kmerCounter, offset);
DEFAULT_ORDERED_KMER_SIZE, this.filter, this.kmerCounter, this.weighted, offset);

return seqStreamer;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,23 @@

import edu.umd.marbl.mhap.utils.MhapRuntimeException;

public abstract class AbstractSequenceBitHash<T extends AbstractSequenceBitHash<T>> implements VectorHash<T>, Comparable<T>
public abstract class AbstractBitSketch<T extends AbstractBitSketch<T>> implements VectorHash<T>,
Comparable<T>
{
// Packed bit vector backing this sketch; each long word holds 64 sketch bits.
protected final long[] bits;

/**
 * Creates a sketch backed by the given packed bit vector.
 *
 * @param bits packed sketch bits; the array is stored directly (not copied),
 *             so callers must not mutate it afterwards
 */
protected AbstractBitSketch(long[] bits)
{
	this.bits = bits;
}

/**
 * Adjusted similarity score; currently identical to {@link #jaccard}.
 *
 * @param sh sketch to compare against
 * @return the clamped Jaccard similarity estimate between the two sketches
 */
public final double adjScore(T sh)
{
	return jaccard(sh);
}

public long[] getBits()
{
return this.bits;
Expand All @@ -50,59 +56,61 @@ public long[] getBits()
/**
 * Lexicographic comparison of the packed bit vectors, word by word as
 * signed longs.
 *
 * <p>NOTE(review): no length check is performed; this assumes both sketches
 * use the same number of words — confirm against the factory sites.
 *
 * @param sim sketch to compare against
 * @return negative, zero, or positive per the {@link Comparable} contract
 */
@Override
public int compareTo(final T sim)
{
	for (int bitIndex = 0; bitIndex < this.bits.length; bitIndex++)
	{
		// Must compare against sim.bits, not this.bits (self-comparison
		// would make every sketch compare equal).
		int cmp = Long.compare(this.bits[bitIndex], sim.bits[bitIndex]);
		if (cmp != 0)
			return cmp;
	}

	return 0;
}

/**
 * Counts the number of bit positions at which the two sketches agree.
 *
 * @param sh sketch to compare against; must have the same number of words
 * @return number of matching bits (total bits minus the Hamming distance)
 * @throws MhapRuntimeException if the two bit vectors differ in length
 */
public final int getIntersectionCount(final T sh)
{
	if (this.bits.length != sh.bits.length)
		throw new MhapRuntimeException("Size of bits in tables must match.");

	// Hamming distance via XOR + popcount; matching bits = total - distance.
	int hammingDistance = 0;
	for (int longIndex = 0; longIndex < this.bits.length; longIndex++)
		hammingDistance += Long.bitCount(this.bits[longIndex] ^ sh.bits[longIndex]);

	return this.bits.length * 64 - hammingDistance;
}

@Override
public final double jaccord(final T sh)
public final double jaccard(final T sh)
{
int count = getIntersectionCount(sh);

return ((double)count/(double)(this.bits.length*64)-0.5)*2.0;
double jaccard = ((double) count / (double) (this.bits.length * 64) - 0.5) * 2.0;

return Math.max(0.0, jaccard);
}

/**
 * Renders the sketch as a string of '0'/'1' characters, most-significant
 * bit of each word first.
 */
@Override
public String toString()
{
	StringBuilder s = new StringBuilder(this.bits.length * 64);

	for (long word : this.bits)
	{
		// Walk the 64 bits of each word from the high bit down. (The loop
		// must count DOWN with bit >= 0; an ascending bit <= 0 condition
		// would never execute and produce an empty string.)
		for (int bit = 63; bit >= 0; bit--)
		{
			if (((word >>> bit) & 1L) == 0L)
				s.append("0");
			else
				s.append("1");
		}
	}
	return s.toString();
}
}
2 changes: 1 addition & 1 deletion src/main/java/edu/umd/marbl/mhap/simhash/VectorHash.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@

/**
 * A sketch that supports pairwise Jaccard-similarity estimation against
 * another sketch of the same concrete type.
 *
 * @param <T> concrete sketch type this sketch is compared against
 */
public interface VectorHash<T>
{
	/**
	 * Estimates the Jaccard similarity between this sketch and {@code sh}.
	 *
	 * @param sh sketch to compare against
	 * @return similarity estimate
	 */
	double jaccard(T sh);
}
28 changes: 22 additions & 6 deletions src/main/java/edu/umd/marbl/mhap/sketch/MinHash.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,15 @@ public final class MinHash implements Serializable


public final static int[] computeKmerMinHashesWeightedIntSuper(String seq, final int kmerSize, final int numHashes,
HashSet<Long> filter, KmerCounts kmerCount)
HashSet<Long> filter, KmerCounts kmerCount, boolean weighted)
{
final int numberKmers = seq.length() - kmerSize + 1;

if (numberKmers < 1)
throw new MhapRuntimeException("Kmer size bigger than string length.");

// get the kmer hashes
final long[] kmerHashes = Utils.computeSequenceHashesLong(seq, kmerSize);
final long[] kmerHashes = Utils.computeSequenceHashesLong(seq, kmerSize, 0);

//now compute the counts of occurance
HashMap<Long, HitCounter> hitMap = new LinkedHashMap<>(kmerHashes.length);
Expand Down Expand Up @@ -100,13 +100,23 @@ public final static int[] computeKmerMinHashesWeightedIntSuper(String seq, final
{
if ((kmerCount!=null && kmerCount.documentFrequencyRatio(key)>kmerCount.getFilterCutoff()) || (filter != null && filter.contains(key)))
{
//System.err.println("Bad = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount));
//System.err.println("Bad = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount));
if (!weighted)
weight = 0;
}
else
weight = weight*3;
{
if (weighted)
weight = weight*3;
else
weight = 1;
}
}
//System.err.println("Good = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount));
//int weight = Math.min(1, (int)Math.round(kmerCount.weight(key, kmer.getValue().count, maxCount)));

if (weight<=0)
continue;

long x = key;

Expand Down Expand Up @@ -241,14 +251,20 @@ private MinHash(int seqLength, int[] minHashes)
this.minHashes = minHashes;
}

/**
 * Builds a MinHash sketch of the given sequence.
 *
 * @param seq       sequence to sketch
 * @param kmerSize  k-mer length used for hashing
 * @param numHashes number of min-hash values to keep in the sketch
 * @param filter    optional set of banned k-mer hashes (may be null)
 * @param kmerCount optional k-mer frequency statistics (may be null)
 * @param weighted  whether to perform weighted MinHashing
 */
public MinHash(Sequence seq, int kmerSize, int numHashes, HashSet<Long> filter, KmerCounts kmerCount, boolean weighted)
{
	this.seqLength = seq.length();
	this.minHashes = MinHash.computeKmerMinHashesWeightedIntSuper(seq.getString(), kmerSize, numHashes, filter, kmerCount, weighted);
}

/**
 * Convenience constructor: sketches a raw string with weighted MinHashing
 * enabled and no k-mer filter or frequency table.
 *
 * @param str       sequence string to sketch
 * @param kmerSize  k-mer length used for hashing
 * @param numHashes number of min-hash values to keep in the sketch
 */
public MinHash(String str, int kmerSize, int numHashes)
{
	this.seqLength = str.length();
	this.minHashes = MinHash.computeKmerMinHashesWeightedIntSuper(str, kmerSize, numHashes, null, null, true);
}

public byte[] getAsByteArray()
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketch.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ public SequenceSketch(SequenceId id, MinHash mainHashes, OrderKmerHashes ordered
}

/**
 * Builds the full sketch (MinHash plus ordered k-mer hashes) for a sequence.
 *
 * @param seq             sequence to sketch
 * @param kmerSize        k-mer length for the MinHash sketch
 * @param numHashes       number of min-hash values to keep
 * @param orderedKmerSize k-mer length for the ordered-hash second-stage filter
 * @param storeHashes     NOTE(review): not used by this constructor body — confirm intent
 * @param filter          optional set of banned k-mer hashes (may be null)
 * @param kmerCount       optional k-mer frequency statistics (may be null)
 * @param weighted        whether to perform weighted MinHashing
 */
public SequenceSketch(Sequence seq, int kmerSize, int numHashes, int orderedKmerSize,
		boolean storeHashes, HashSet<Long> filter, KmerCounts kmerCount, boolean weighted)
{
	this.id = seq.getId();
	this.mainHashes = new MinHash(seq, kmerSize, numHashes, filter, kmerCount, weighted);
	this.orderedHashes = new OrderKmerHashes(seq, orderedKmerSize);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public class SequenceSketchStreamer
private final AtomicLong numberSubSequencesProcessed;
private final int numHashes;
private final int offset;
private final boolean weighted;

private final int orderedKmerSize;
private boolean readClosed;
Expand All @@ -78,6 +79,7 @@ public SequenceSketchStreamer(String file, int offset) throws FileNotFoundExcept
this.sequenceHashList = new ConcurrentLinkedQueue<SequenceSketch>();
this.numberProcessed = new AtomicLong();
this.kmerCounter = null;
this.weighted = true;

this.kmerSize = 0;
this.numHashes = 0;
Expand All @@ -91,13 +93,14 @@ public SequenceSketchStreamer(String file, int offset) throws FileNotFoundExcept
}

public SequenceSketchStreamer(String file, int kmerSize, int numHashes, int subSequenceSize, int orderedKmerSize,
HashSet<Long> filter, KmerCounts kmerCounter, int offset) throws IOException
HashSet<Long> filter, KmerCounts kmerCounter, boolean weighted, int offset) throws IOException
{
this.fastaData = new FastaData(file, offset);
this.readingFasta = true;
this.sequenceHashList = new ConcurrentLinkedQueue<SequenceSketch>();
this.numberProcessed = new AtomicLong();

this.weighted = weighted;
this.kmerCounter = kmerCounter;
this.kmerSize = kmerSize;
this.numHashes = numHashes;
Expand Down Expand Up @@ -228,7 +231,7 @@ public int getFastaProcessed()
/**
 * Computes the sketch (MinHash plus ordered k-mer hashes) for one sequence
 * using this streamer's configured parameters.
 *
 * @param seq sequence to sketch
 * @return the computed sketch
 */
public SequenceSketch getSketch(Sequence seq)
{
	// storeHashes is passed as false here; presumably hashes are not cached
	// for streamed sequences — confirm against SequenceSketch.
	return new SequenceSketch(seq, this.kmerSize, this.numHashes, this.orderedKmerSize, false, this.filter, this.kmerCounter, this.weighted);
}

public int getNumberProcessed()
Expand Down
Loading

0 comments on commit 5b60eb4

Please sign in to comment.