forked from SiTLar/pyt-vanilla-html
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hasher.js
70 lines (65 loc) · 1.72 KB
/
hasher.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"use strict";
var crc32 = require("crc-32");
define("hasher", [], function(){
/**
 * Splits text into shingles: words delimited by punctuation or whitespace,
 * where any word longer than 12 characters is cut into consecutive chunks
 * of at most 12 characters.
 *
 * @param {string} original - text to tokenize
 * @returns {string[]} shingles in order of appearance; an empty array when
 *     the text consists only of separators (or is empty)
 */
function tokenize(original) {
    var SHINGLE_SIZE = 12;
    // The "g" flag replaces every separator rather than only the first one.
    // The hyphen is escaped so that "+-=" is three literal characters and not
    // a range, which would also match the digits 0-9.
    var separators = /[,.\/?\\~!`'"@#$%^&*()_+\-=;:{}\[\]]/g;
    var shingles = [];
    // Splitting on whitespace runs treats newlines and tabs like spaces.
    original.replace(separators, " ").split(/\s+/).forEach(function (word) {
        // Empty words never enter the loop, so they produce no shingle.
        // slice() clamps at the end of the string: the last chunk may be shorter.
        for (var start = 0; start < word.length; start += SHINGLE_SIZE) {
            shingles.push(word.slice(start, start + SHINGLE_SIZE));
        }
    });
    return shingles;
}
/**
 * Computes (a * x + b) modulo 2^32 exactly and returns it as a signed
 * 32-bit integer.
 *
 * A plain `a * x + b` on 32-bit operands can exceed 2^53 and lose its low
 * bits in a double, so the multiplier is split into 16-bit halves whose
 * partial products stay below 2^49.
 *
 * @param {number} a - multiplier, reduced to an unsigned 32-bit integer
 * @param {number} x - value being hashed, reduced to an unsigned 32-bit integer
 * @param {number} b - offset, reduced to an unsigned 32-bit integer
 * @returns {number} signed 32-bit integer congruent to a * x + b (mod 2^32)
 */
function affineHash32(a, x, b) {
    var multiplier = a >>> 0;
    var value = x >>> 0;
    var highHalf = multiplier >>> 16;
    var lowHalf = multiplier & 0xffff;
    // Only the low 16 bits of highHalf * value survive the shift by 16 (mod 2^32).
    var carried = (highHalf * value) & 0xffff;
    return (carried * 0x10000 + lowHalf * value + (b >>> 0)) | 0;
}

/**
 * MinHash signature generator backed by `fnum` random affine hash functions.
 * The coefficients are drawn with Math.random() at construction time and
 * stored in `fArr`, so signatures are only comparable when they were
 * produced with the same `fArr`.
 *
 * @constructor
 * @param {Object} [options]
 * @param {number} [options.fnum=100] - number of hash functions, i.e. the
 *     length of every signature
 */
function _Minhash (options){
    var requested = (options === undefined || options === null) ? undefined : options.fnum;
    // Fall back to 100 both when options is missing and when it carries no fnum.
    this.fnum = (requested === undefined || requested === null) ? 100 : requested;
    this.fArr = [];
    for (var idx = 0; idx < this.fnum; idx++) {
        this.fArr[idx] = {
            // Forced odd: an odd multiplier is invertible modulo 2^32, so two
            // different shingle hashes can never collapse to the same value
            // (and the degenerate multiplier 0 is excluded).
            "a": (Math.floor(Math.random() * 0x100000000) | 1) >>> 0,
            "b": Math.floor(Math.random() * 0xffffffff)
        };
    }
}

_Minhash.prototype = {
    constructor: _Minhash,

    /**
     * Builds the MinHash signature of a text.
     *
     * @param {string} str - text to hash; it is split with tokenize()
     * @returns {number[]} one signed 32-bit minimum per hash function in
     *     `fArr`; every entry is Infinity when tokenize() yields no tokens
     */
    "of": function (str) {
        var tokens = tokenize(str);
        var shingles = tokens.map(function (token) {
            // crc32.str is called with exactly one argument: passing it
            // straight to map() would also hand over the element index, which
            // crc-32 accepts as a seed and would make a token's hash depend
            // on its position. ">>> 0" turns the signed CRC into an unsigned
            // 32-bit value.
            return crc32.str(token) >>> 0;
        });
        return this.fArr.map(function (coef) {
            // A plain loop instead of Math.min.apply avoids the engine's
            // argument-count limit on very long texts.
            var lowest = Infinity;
            for (var i = 0; i < shingles.length; i++) {
                var hashed = affineHash32(coef.a, shingles[i], coef.b);
                if (hashed < lowest) {
                    lowest = hashed;
                }
            }
            return lowest;
        });
    },

    /**
     * Fraction of positions of `x` at which `y` holds the same value.
     *
     * @param {number[]} x - reference signature; its length is the denominator
     * @param {number[]} y - signature compared against `x`; positions missing
     *     from `y` count as mismatches
     * @returns {number} value in [0, 1]; 0 when `x` is empty
     */
    "similarity": function (x, y) {
        if (x.length === 0) {
            // Avoids the 0 / 0 = NaN result for empty signatures.
            return 0;
        }
        var matches = 0;
        for (var idx = 0; idx < x.length; idx++) {
            // Loose equality is kept from the original implementation.
            // NOTE(review): switch to === once it is confirmed that callers
            // never pass stringified signature entries.
            if (y[idx] == x[idx]) {
                matches += 1;
            }
        }
        return matches / x.length;
    }
};
return {
"_Minhash":_Minhash
}
});