Skip to content

Commit

Permalink
added simhash
Browse files Browse the repository at this point in the history
  • Loading branch information
konstantinberlin committed Jan 31, 2015
1 parent aaf4927 commit 5b60eb4
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 154 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public static void main(String[] args) throws Exception {
data.enqueueFullFile();

//SimHashSearch hashSearch = new SimHashSearch(kmerSize, numWords);
SequenceSketchStreamer seqStreamer = new SequenceSketchStreamer(inFile, kmerSize, numWords, DEFAULT_SUB_SEQUENCE_SIZE, 12, null, null, 0);
SequenceSketchStreamer seqStreamer = new SequenceSketchStreamer(inFile, kmerSize, numWords, DEFAULT_SUB_SEQUENCE_SIZE, 12, null, null, true, 0);

MinHashSearch hashSearch = new MinHashSearch(seqStreamer, numWords, DEFAULT_NUM_MIN_MATCHES, numThreads, true, 0, 800, DEFAULT_THRESHOLD);
System.err.println("Processed "+seqStreamer.getNumberProcessed()+" sequences.");
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/edu/umd/marbl/mhap/main/KmerStatSimulator.java
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,8 @@ public double compareKmers(String first, String second) {
}

/**
 * Estimates the Jaccard similarity between two sequences by building a
 * MinHash sketch of each and comparing the sketches.
 *
 * @param first  first sequence string
 * @param second second sequence string
 * @return estimated Jaccard similarity between the two sequences' k-mer sets
 */
public double compareMinHash(String first, String second) {
	// 1256 hash values per sketch; no k-mer filter or frequency table,
	// weighted hashing enabled. NOTE(review): 1256 looks like a tuning
	// constant — consider promoting it to a named constant or parameter.
	MinHash h1 = new MinHash(new Sequence(first, new SequenceId(1)), this.kmer, 1256, null, null, true);
	MinHash h2 = new MinHash(new Sequence(second, new SequenceId(2)), this.kmer, 1256, null, null, true);

	return h1.jaccard(h2);
}
Expand Down
25 changes: 8 additions & 17 deletions src/main/java/edu/umd/marbl/mhap/main/MhapMain.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,30 +56,19 @@
public final class MhapMain
{
private final double acceptScore;

private final HashSet<Long> filter;

private final String inFile;

private final int kmerSize;

private final double maxShift;

private final int minStoreLength;

private final boolean noSelf;

private final int numHashes;

private final int numMinMatches;

protected final int numThreads;

private final String processFile;

private final int subSequenceSize;

private final int subSequenceSize;
private final String toFile;
private final boolean weighted;

private final KmerCounts kmerCounter;

Expand Down Expand Up @@ -121,6 +110,7 @@ public static void main(String[] args) throws Exception
options.addOption("--max-shift", "[double], region size to the left and right of the estimated overlap, as derived from the median shift and sequence length, where a k-mer matches are still considered valid. Second stage filter only.", DEFAULT_MAX_SHIFT_PERCENT);
options.addOption("--num-min-matches", "[int], minimum # min-mer that must be shared before computing second stage filter. Any sequences below that value are considered non-overlapping.", DEFAULT_NUM_MIN_MATCHES);
options.addOption("--num-threads", "[int], number of threads to use for computation. Typically set to 2 x #cores.", DEFAULT_NUM_THREADS);
options.addOption("--weighted", "Perform weighted MinHashing.", false);
options.addOption("--min-store-length", "[int], The minimum length of the read that is stored in the box. Used to filter out short reads from FASTA file.", DEFAULT_MIN_STORE_LENGTH);
options.addOption("--no-self", "Do not compute the overlaps between sequences inside a box. Should be used when the to and from sequences are coming from different files.", false);
options.addOption("--store-full-id", "Store full IDs as seen in FASTA file, rather than storing just the sequence position in the file. Some FASTA files have long IDS, slowing output of results. IDs not stored in compressed files.", false);
Expand Down Expand Up @@ -289,6 +279,7 @@ public MhapMain(ParseOptions options) throws IOException
this.minStoreLength = options.get("--min-store-length").getInteger();
this.maxShift = options.get("--max-shift").getDouble();
this.acceptScore = options.get("--threshold").getDouble();
this.weighted = options.get("--weighted").getBoolean();

// read in the kmer filter set
String filterFile = options.get("-f").getString();
Expand All @@ -299,7 +290,7 @@ public MhapMain(ParseOptions options) throws IOException
System.err.println("Reading in filter file " + filterFile + ".");
try
{
this.filter = Utils.createKmerFilter(filterFile, options.get("--filter-threshold").getDouble(), this.kmerSize);
this.filter = Utils.createKmerFilter(filterFile, options.get("--filter-threshold").getDouble(), this.kmerSize, 0);
}
catch (Exception e)
{
Expand Down Expand Up @@ -343,14 +334,14 @@ public void run()
while (seq != null)
{
//get the kmers integers
long[] kmerHashes = Utils.computeSequenceHashesLong(seq.getString(), MhapMain.this.kmerSize);
long[] kmerHashes = Utils.computeSequenceHashesLong(seq.getString(), MhapMain.this.kmerSize, 0);

//store the values
for (long val : kmerHashes)
countMin.add(val);

//get the kmers integers for reverse compliment
kmerHashes = Utils.computeSequenceHashesLong(seq.getReverseCompliment().getString(), MhapMain.this.kmerSize);
kmerHashes = Utils.computeSequenceHashesLong(seq.getReverseCompliment().getString(), MhapMain.this.kmerSize, 0);

//store the values
for (long val : kmerHashes)
Expand Down Expand Up @@ -581,7 +572,7 @@ public SequenceSketchStreamer getSequenceHashStreamer(String file, int offset) t
seqStreamer = new SequenceSketchStreamer(file, offset);
else
seqStreamer = new SequenceSketchStreamer(file, this.kmerSize, this.numHashes, this.subSequenceSize,
DEFAULT_ORDERED_KMER_SIZE, this.filter, this.kmerCounter, offset);
DEFAULT_ORDERED_KMER_SIZE, this.filter, this.kmerCounter, this.weighted, offset);

return seqStreamer;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,23 @@

import edu.umd.marbl.mhap.utils.MhapRuntimeException;

public abstract class AbstractSequenceBitHash<T extends AbstractSequenceBitHash<T>> implements VectorHash<T>, Comparable<T>
public abstract class AbstractBitSketch<T extends AbstractBitSketch<T>> implements VectorHash<T>,
Comparable<T>
{
// Packed bit vector backing this sketch; each long word holds 64 sketch bits.
protected final long[] bits;

/**
 * Creates a sketch backed by the given packed bit vector.
 *
 * @param bits packed sketch bits; the array is stored directly (not copied),
 *             so callers must not mutate it afterwards
 */
protected AbstractBitSketch(long[] bits)
{
	this.bits = bits;
}

/**
 * Adjusted similarity score; currently identical to {@link #jaccard}.
 *
 * @param sh sketch to compare against
 * @return the clamped Jaccard similarity estimate between the two sketches
 */
public final double adjScore(T sh)
{
	return jaccard(sh);
}

public long[] getBits()
{
return this.bits;
Expand All @@ -50,59 +56,61 @@ public long[] getBits()
/**
 * Lexicographic comparison of the packed bit vectors, word by word as
 * signed longs.
 *
 * <p>NOTE(review): no length check is performed; this assumes both sketches
 * use the same number of words — confirm against the factory sites.
 *
 * @param sim sketch to compare against
 * @return negative, zero, or positive per the {@link Comparable} contract
 */
@Override
public int compareTo(final T sim)
{
	for (int bitIndex = 0; bitIndex < this.bits.length; bitIndex++)
	{
		// Must compare against sim.bits, not this.bits (self-comparison
		// would make every sketch compare equal).
		int cmp = Long.compare(this.bits[bitIndex], sim.bits[bitIndex]);
		if (cmp != 0)
			return cmp;
	}

	return 0;
}

/**
 * Counts the number of bit positions at which the two sketches agree.
 *
 * @param sh sketch to compare against; must have the same number of words
 * @return number of matching bits (total bits minus the Hamming distance)
 * @throws MhapRuntimeException if the two bit vectors differ in length
 */
public final int getIntersectionCount(final T sh)
{
	if (this.bits.length != sh.bits.length)
		throw new MhapRuntimeException("Size of bits in tables must match.");

	// Hamming distance via XOR + popcount; matching bits = total - distance.
	int hammingDistance = 0;
	for (int longIndex = 0; longIndex < this.bits.length; longIndex++)
		hammingDistance += Long.bitCount(this.bits[longIndex] ^ sh.bits[longIndex]);

	return this.bits.length * 64 - hammingDistance;
}

@Override
public final double jaccord(final T sh)
public final double jaccard(final T sh)
{
int count = getIntersectionCount(sh);

return ((double)count/(double)(this.bits.length*64)-0.5)*2.0;
double jaccard = ((double) count / (double) (this.bits.length * 64) - 0.5) * 2.0;

return Math.max(0.0, jaccard);
}

/**
 * Renders the sketch as a string of '0'/'1' characters, most-significant
 * bit of each word first.
 */
@Override
public String toString()
{
	StringBuilder s = new StringBuilder(this.bits.length * 64);

	for (long word : this.bits)
	{
		// Walk the 64 bits of each word from the high bit down. (The loop
		// must count DOWN with bit >= 0; an ascending bit <= 0 condition
		// would never execute and produce an empty string.)
		for (int bit = 63; bit >= 0; bit--)
		{
			if (((word >>> bit) & 1L) == 0L)
				s.append("0");
			else
				s.append("1");
		}
	}
	return s.toString();
}
}
2 changes: 1 addition & 1 deletion src/main/java/edu/umd/marbl/mhap/simhash/VectorHash.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@

/**
 * A sketch that supports pairwise Jaccard-similarity estimation against
 * another sketch of the same concrete type.
 *
 * @param <T> concrete sketch type this sketch is compared against
 */
public interface VectorHash<T>
{
	/**
	 * Estimates the Jaccard similarity between this sketch and {@code sh}.
	 *
	 * @param sh sketch to compare against
	 * @return similarity estimate
	 */
	double jaccard(T sh);
}
28 changes: 22 additions & 6 deletions src/main/java/edu/umd/marbl/mhap/sketch/MinHash.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,15 @@ public final class MinHash implements Serializable


public final static int[] computeKmerMinHashesWeightedIntSuper(String seq, final int kmerSize, final int numHashes,
HashSet<Long> filter, KmerCounts kmerCount)
HashSet<Long> filter, KmerCounts kmerCount, boolean weighted)
{
final int numberKmers = seq.length() - kmerSize + 1;

if (numberKmers < 1)
throw new MhapRuntimeException("Kmer size bigger than string length.");

// get the kmer hashes
final long[] kmerHashes = Utils.computeSequenceHashesLong(seq, kmerSize);
final long[] kmerHashes = Utils.computeSequenceHashesLong(seq, kmerSize, 0);

//now compute the counts of occurance
HashMap<Long, HitCounter> hitMap = new LinkedHashMap<>(kmerHashes.length);
Expand Down Expand Up @@ -100,13 +100,23 @@ public final static int[] computeKmerMinHashesWeightedIntSuper(String seq, final
{
if ((kmerCount!=null && kmerCount.documentFrequencyRatio(key)>kmerCount.getFilterCutoff()) || (filter != null && filter.contains(key)))
{
//System.err.println("Bad = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount));
//System.err.println("Bad = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount));
if (!weighted)
weight = 0;
}
else
weight = weight*3;
{
if (weighted)
weight = weight*3;
else
weight = 1;
}
}
//System.err.println("Good = "+kmerCount.inverseDocumentFrequency(key)+", "+kmerCount.weight(key, weight, maxCount));
//int weight = Math.min(1, (int)Math.round(kmerCount.weight(key, kmer.getValue().count, maxCount)));

if (weight<=0)
continue;

long x = key;

Expand Down Expand Up @@ -241,14 +251,20 @@ private MinHash(int seqLength, int[] minHashes)
this.minHashes = minHashes;
}

/**
 * Builds a MinHash sketch of the given sequence.
 *
 * @param seq       sequence to sketch
 * @param kmerSize  k-mer length used for hashing
 * @param numHashes number of min-hash values to keep in the sketch
 * @param filter    optional set of banned k-mer hashes (may be null)
 * @param kmerCount optional k-mer frequency statistics (may be null)
 * @param weighted  whether to perform weighted MinHashing
 */
public MinHash(Sequence seq, int kmerSize, int numHashes, HashSet<Long> filter, KmerCounts kmerCount, boolean weighted)
{
	this.seqLength = seq.length();
	this.minHashes = MinHash.computeKmerMinHashesWeightedIntSuper(seq.getString(), kmerSize, numHashes, filter, kmerCount, weighted);
}

/**
 * Convenience constructor: sketches a raw string with weighted MinHashing
 * enabled and no k-mer filter or frequency table.
 *
 * @param str       sequence string to sketch
 * @param kmerSize  k-mer length used for hashing
 * @param numHashes number of min-hash values to keep in the sketch
 */
public MinHash(String str, int kmerSize, int numHashes)
{
	this.seqLength = str.length();
	this.minHashes = MinHash.computeKmerMinHashesWeightedIntSuper(str, kmerSize, numHashes, null, null, true);
}

public byte[] getAsByteArray()
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/edu/umd/marbl/mhap/sketch/SequenceSketch.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ public SequenceSketch(SequenceId id, MinHash mainHashes, OrderKmerHashes ordered
}

/**
 * Builds the full sketch (MinHash plus ordered k-mer hashes) for a sequence.
 *
 * @param seq             sequence to sketch
 * @param kmerSize        k-mer length for the MinHash sketch
 * @param numHashes       number of min-hash values to keep
 * @param orderedKmerSize k-mer length for the ordered-hash second-stage filter
 * @param storeHashes     NOTE(review): not used by this constructor body — confirm intent
 * @param filter          optional set of banned k-mer hashes (may be null)
 * @param kmerCount       optional k-mer frequency statistics (may be null)
 * @param weighted        whether to perform weighted MinHashing
 */
public SequenceSketch(Sequence seq, int kmerSize, int numHashes, int orderedKmerSize,
		boolean storeHashes, HashSet<Long> filter, KmerCounts kmerCount, boolean weighted)
{
	this.id = seq.getId();
	this.mainHashes = new MinHash(seq, kmerSize, numHashes, filter, kmerCount, weighted);
	this.orderedHashes = new OrderKmerHashes(seq, orderedKmerSize);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public class SequenceSketchStreamer
private final AtomicLong numberSubSequencesProcessed;
private final int numHashes;
private final int offset;
private final boolean weighted;

private final int orderedKmerSize;
private boolean readClosed;
Expand All @@ -78,6 +79,7 @@ public SequenceSketchStreamer(String file, int offset) throws FileNotFoundExcept
this.sequenceHashList = new ConcurrentLinkedQueue<SequenceSketch>();
this.numberProcessed = new AtomicLong();
this.kmerCounter = null;
this.weighted = true;

this.kmerSize = 0;
this.numHashes = 0;
Expand All @@ -91,13 +93,14 @@ public SequenceSketchStreamer(String file, int offset) throws FileNotFoundExcept
}

public SequenceSketchStreamer(String file, int kmerSize, int numHashes, int subSequenceSize, int orderedKmerSize,
HashSet<Long> filter, KmerCounts kmerCounter, int offset) throws IOException
HashSet<Long> filter, KmerCounts kmerCounter, boolean weighted, int offset) throws IOException
{
this.fastaData = new FastaData(file, offset);
this.readingFasta = true;
this.sequenceHashList = new ConcurrentLinkedQueue<SequenceSketch>();
this.numberProcessed = new AtomicLong();

this.weighted = weighted;
this.kmerCounter = kmerCounter;
this.kmerSize = kmerSize;
this.numHashes = numHashes;
Expand Down Expand Up @@ -228,7 +231,7 @@ public int getFastaProcessed()
/**
 * Computes the sketch (MinHash plus ordered k-mer hashes) for one sequence
 * using this streamer's configured parameters.
 *
 * @param seq sequence to sketch
 * @return the computed sketch
 */
public SequenceSketch getSketch(Sequence seq)
{
	// storeHashes is passed as false here; presumably hashes are not cached
	// for streamed sequences — confirm against SequenceSketch.
	return new SequenceSketch(seq, this.kmerSize, this.numHashes, this.orderedKmerSize, false, this.filter, this.kmerCounter, this.weighted);
}

public int getNumberProcessed()
Expand Down
Loading

0 comments on commit 5b60eb4

Please sign in to comment.