diff --git a/README.md b/README.md index 6de3db1..46c313b 100644 --- a/README.md +++ b/README.md @@ -2,30 +2,32 @@ [![Build status](https://travis-ci.org/Hexagon/thinker-fts.svg)](https://travis-ci.org/Hexagon/thinker-fts) [![npm version](https://badge.fury.io/js/thinker-fts.svg)](https://badge.fury.io/js/thinker-fts) -Fast and extendible Node.js/Javascript fulltext search engine. +Fast and extendible Node.js/Javascript full text search engine. ## Features * Highly optimized, will give a ranked resultset within 20 ms on a 5000 (average wikipedia sized) document dataset. * In-memory operation - * Very few external dependencies + * Few external dependencies * Natural language search * Partial matching - * Expression correction/Suggestions + * Expression correction / suggestions * Weighted ranker (configurable weights for each field, all-expression-match-factor, partial vs exact factor etc.) * Field preprocessors * HTML-Stripper * Word preprocessors - * Swedish stemmer with stemmer stopwords - * Stopwords + * Swedish stemmer with stemmer stop words + * Stop words * Wordforms * Stripper for multiple characters - * Allows saving/loading the index to/from disk, it's a lot faster to load a previously saved index than generating it on the fly. + * Allows saving/loading the index to/from disk, but for small datasets you can feed the index on-the-fly. + ## Installation npm install thinker-fts + ## Quick-start A simple setup with feeding and searching would look something like the snippet below @@ -62,9 +64,10 @@ console.log(result); Please not that you _have to_ connect a ranker, else find won't provide a result set. The ranker build the result set. + ## Basic configuration -Thinkers default configuration is overridden by supplying an optoions object to Thinkers constructor. There is also a couple of settings that can be changed on runtime, both is shown below +Thinkers default configuration is overridden by supplying an options object to Thinkers constructor. 
There are also a couple of settings that can be changed at runtime; both are shown below ```javascript @@ -231,11 +234,11 @@ thinker.addWordProcessor(stopwords); thinker.ranker = ranker; ``` -#### Stopwords +#### Stop words Removes words that don't give better precision, normally stuff like 'and', 'I', 'they', 'we', 'can'. Adding the most common words here can speed up the quries a bit, and save some RAM. -Example setting up thinker with standard ranker and stopwords +Example setting up thinker with standard ranker and stop words ```javascript var thinker = Thinker(), @@ -250,21 +253,36 @@ thinker.addWordProcessor(stopwords); thinker.ranker = ranker; ``` -#### Stemmer +#### Stemmers Finds the stem of each word that is indexed, 'computers' will become 'computer', 'organized' will become 'organize' etc. This greatly improves accuracy of the matches and weighting. -An optinal feature of the stemmers is to supply a list of words that you don't want to stem down. Names is one thing you probably want to except from the stemmer. +An optional feature of the stemmers is to supply a list of words that you don't want to stem down. -Currently only available for swedish +Currently there are two stemmers available: Swedish through a custom version of the Snowball algorithm, and English through the Porter algorithm. 
-Example setting up thinker with standard ranker, stemming, and stemmer stopwords +Example setting up thinker with standard ranker and English stemming ```javascript var thinker = Thinker(), ranker = Thinker.rankers.standard(), - stemmer = Thinker.processors.swedishStemmer({ + stemmer = Thinker.processors.stemmers.english(); + +thinker.addWordProcessor(stemmer); + +thinker.ranker = ranker; + +``` + + +Example setting up thinker with standard ranker, Swedish stemming, and stemmer stop words + +```javascript +var + thinker = Thinker(), + ranker = Thinker.rankers.standard(), + stemmer = Thinker.processors.stemmers.swedish({ "stemmer": true, "stemming": true, "dontstemthiseither": true, @@ -277,11 +295,32 @@ thinker.addWordProcessor(stemmer); thinker.ranker = ranker; ``` + +## Dependencies + +Note: Dependencies are installed automatically by npm + + [fast-levenshtein](https://github.com/hiddentao/fast-levenshtein) (https://github.com/hiddentao/fast-levenshtein) + + [stemmer](https://github.com/wooorm/stemmer) (https://github.com/wooorm/stemmer) + + +## Development dependencies + +Note: Not needed for normal usage + + [mocha](https://github.com/mochajs/mocha) (https://github.com/mochajs/mocha) + + [should](https://github.com/shouldjs/should.js) (https://github.com/shouldjs/should.js) + + ## Credits [Hexagon](https://github.com/hexagon/) [Pehr Boman](https://github.com/unkelpehr/) + ## Licence + Licensed under the [MIT License](http://opensource.org/licenses/MIT) diff --git a/lib/processors.js b/lib/processors.js index 331a784..201c778 100644 --- a/lib/processors.js +++ b/lib/processors.js @@ -24,6 +24,8 @@ THE SOFTWARE. 
'use strict'; +var porterStemmer = require('stemmer'); + function stopwords ( stopwords ) { var stopwords = stopwords || {}; return function ( w ) { @@ -236,8 +238,17 @@ function swedishStemmer ( stopwords ) { }; +function englishStemmer ( ) { + return function ( w ) { + return porterStemmer( w ); + }; +}; + module.exports = { - swedishStemmer: swedishStemmer, + stemmers: { + swedish: swedishStemmer, + english: englishStemmer + }, stopwords: stopwords, wordforms: wordforms, multiples: multiples, diff --git a/package.json b/package.json index 6d78393..68e2725 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,8 @@ "in-memory" ], "dependencies": { - "fast-levenshtein": "*" + "fast-levenshtein": "*", + "stemmer": "*" }, "devDependencies": { "mocha": "*", diff --git a/test/test.js b/test/test.js index cfcce0c..2ffd533 100644 --- a/test/test.js +++ b/test/test.js @@ -80,51 +80,6 @@ describe('Simple usage', function () { }); }); -describe('Stemmer', function () { - var stemmerStopwords = { - "anders": true, - "jonas": true - }; - - var thinker = Thinker(); - var ranker = Thinker.rankers.standard(); - var stemmer = Thinker.processors.swedishStemmer(stemmerStopwords); - var exampleTextsCopy = JSON.parse(JSON.stringify(exampleTexts)); - - thinker.addWordProcessor(stemmer); - thinker.ranker = ranker; - - thinker.feed(exampleTextsCopy, { - characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g - }); - - describe('Search for stopword "anders"', function () { - var result = thinker.find("anders"); - - it('Should return one expression', function () { - result.results.expressions.length.should.equal(1); - }); - - it('Expression interpretation be unchanged("anders")', function () { - result.results.expressions[0].interpretation.should.equal("anders"); - }); - - it('Should return two results', function () { - result.results.documents.length.should.equal(2); - }); - - it('First result should be a direct match (anders)', function () { - result.results.documents[0].directMatches.should.equal(1); 
- result.results.documents[0].partialMatches.should.equal(0); - }); - - it('Second result should be a partial match (andersson)', function () { - result.results.documents[1].directMatches.should.equal(0); - result.results.documents[1].partialMatches.should.equal(1); - }); - }); -}); - describe('Partial match', function () { var thinker = Thinker(); @@ -429,6 +384,354 @@ describe('Word-processor: Multiples', function () { }); }); + +describe('Word processor: Swedish stemmer', function () { + var stemmerStopwords = { + "anders": true, + "jonas": true + }; + + var thinker = Thinker(); + var ranker = Thinker.rankers.standard(); + var stemmer = Thinker.processors.stemmers.swedish(stemmerStopwords); + var exampleTextsCopy = JSON.parse(JSON.stringify(exampleTexts)); + + thinker.addWordProcessor(stemmer); + thinker.ranker = ranker; + + thinker.feed(exampleTextsCopy, { + characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g + }); + + describe('Search for stopword "anders"', function () { + var result = thinker.find("anders"); + + it('Should return one expression', function () { + result.results.expressions.length.should.equal(1); + }); + + it('Expression interpretation be unchanged("anders")', function () { + result.results.expressions[0].interpretation.should.equal("anders"); + }); + + it('Should return two results', function () { + result.results.documents.length.should.equal(2); + }); + + it('First result should be a direct match (anders)', function () { + result.results.documents[0].directMatches.should.equal(1); + result.results.documents[0].partialMatches.should.equal(0); + }); + + it('Second result should be a partial match (andersson)', function () { + result.results.documents[1].directMatches.should.equal(0); + result.results.documents[1].partialMatches.should.equal(1); + }); + }); + + describe('Search for "Bemötas"', function () { + + var result = thinker.find("Bemötas"); + + it('Should return one expression', function () { + result.results.expressions.length.should.equal(1); + }); 
+ + it('Expression interpretation should equal "bemöt"', function () { + result.results.expressions[0].interpretation.should.equal("bemöt"); + }); + + it('Should return three results (bemötandet, bemötande, bemött)', function () { + result.results.documents.length.should.equal(3); + }); + + it('All results should be a direct match', function () { + result.results.documents[0].directMatches.should.equal(1); + result.results.documents[0].partialMatches.should.equal(0); + result.results.documents[1].directMatches.should.equal(1); + result.results.documents[1].partialMatches.should.equal(0); + result.results.documents[2].directMatches.should.equal(1); + result.results.documents[2].partialMatches.should.equal(0); + }); + + }); + + describe('Search for "nyheternas"', function () { + + var result = thinker.find("nyheternas"); + + it('Should return one expression', function () { + result.results.expressions.length.should.equal(1); + }); + + it('Expression interpretation should equal "nyhet"', function () { + result.results.expressions[0].interpretation.should.equal("nyhet"); + }); + + it('Should return 1 document', function () { + result.results.documents.length.should.equal(1); + }); + + it('All four (nyhet, nyheter, nyheten, nyhetens)results should be a direct match on the first result', function () { + result.results.documents[0].directMatches.should.equal(4); + result.results.documents[0].partialMatches.should.equal(0); + }); + + }); + + describe('Search for "nya"', function () { + + var result = thinker.find("nya"); + + it('Should return one expression', function () { + result.results.expressions.length.should.equal(1); + }); + + it('Expression interpretation should equal "ny"', function () { + result.results.expressions[0].interpretation.should.equal("ny"); + }); + + it('Should return one document', function () { + result.results.documents.length.should.equal(1); + }); + + it('The result should be a direct match on the first result', function () { + 
result.results.documents[0].directMatches.should.equal(1); + result.results.documents[0].partialMatches.should.equal(0); + }); + + }); + + describe('Search for "radioar"', function () { + + var result = thinker.find("radioar"); + + it('Expression interpretation should equal "radio"', function () { + result.results.expressions[0].interpretation.should.equal("radio"); + }); + + }); + + describe('Search for "sprit"', function () { + + var result = thinker.find("sprit"); + + it('Expression interpretation should equal "sprit"', function () { + result.results.expressions[0].interpretation.should.equal("sprit"); + }); + + }); + + describe('Search for "produktutveckling"', function () { + + var result = thinker.find("produktutveckling"); + + it('Expression interpretation should equal "produktutveckl"', function () { + result.results.expressions[0].interpretation.should.equal("produktutveckl"); + }); + + }); + + describe('Search for "produktutvecklare"', function () { + + var result = thinker.find("produktutvecklare"); + + it('Expression interpretation should equal "produktutveckl"', function () { + result.results.expressions[0].interpretation.should.equal("produktutveckl"); + }); + + }); + + describe('Search for "produktutvecklarens"', function () { + + var result = thinker.find("produktutvecklarens"); + + it('Expression interpretation should equal "produktutveckl"', function () { + result.results.expressions[0].interpretation.should.equal("produktutveckl"); + }); + + }); + + describe('Search for "skrotverktyget"', function () { + + var result = thinker.find("skrotverktyget"); + + it('Expression interpretation should equal "skrotverktyg"', function () { + result.results.expressions[0].interpretation.should.equal("skrotverktyg"); + }); + + }); + + + describe('Search for "skrotverktygets"', function () { + + var result = thinker.find("skrotverktygets"); + + + it('Expression interpretation should equal "skrotverktyg"', function () { + 
result.results.expressions[0].interpretation.should.equal("skrotverktyg"); + }); + + }); + + describe('Search for "sandning"', function () { + + var result = thinker.find("sandning"); + + it('Expression interpretation should equal "sand"', function () { + result.results.expressions[0].interpretation.should.equal("sand"); + }); + + }); + + describe('Search for "sand"', function () { + + var result = thinker.find("sand"); + + it('Expression interpretation should equal "sand"', function () { + result.results.expressions[0].interpretation.should.equal("sand"); + }); + + }); + + describe('Search for "sandarens"', function () { + + var result = thinker.find("sandarens"); + + it('Expression interpretation should equal "sand"', function () { + result.results.expressions[0].interpretation.should.equal("sand"); + }); + + }); + + + describe('Search for "skrotverktyg"', function () { + + var result = thinker.find("skrotverktyg"); + + it('Expression interpretation should equal "skrotverktyg"', function () { + result.results.expressions[0].interpretation.should.equal("skrotverktyg"); + }); + + }); + + describe('Search for "inbyggda"', function () { + + var result = thinker.find("inbyggda"); + + it('Expression interpretation should equal "inbygg"', function () { + result.results.expressions[0].interpretation.should.equal("inbygg"); + }); + + }); + + describe('Search for "inbyggd"', function () { + + var result = thinker.find("inbyggd"); + + it('Expression interpretation should equal "inbygg"', function () { + result.results.expressions[0].interpretation.should.equal("inbygg"); + }); + + }); + + describe('Search for "antikviteten"', function () { + + var result = thinker.find("antikviteten"); + + it('Should return one expression', function () { + result.results.expressions.length.should.equal(1); + }); + + it('Expression interpretation should equal "antikv"', function () { + result.results.expressions[0].interpretation.should.equal("antikv"); + }); + + it('Should return one result 
(antikviteten, antivitet, antikvitets)', function () { + result.results.documents.length.should.equal(3); + }); + + it('All results should be a direct match', function () { + result.results.documents[0].directMatches.should.equal(1); + result.results.documents[0].partialMatches.should.equal(0); + result.results.documents[1].directMatches.should.equal(1); + result.results.documents[1].partialMatches.should.equal(0); + result.results.documents[2].directMatches.should.equal(1); + result.results.documents[2].partialMatches.should.equal(0); + }); + + }); +}); + +describe('Word processor: English stemmer', function () { + + var thinker = Thinker(); + var ranker = Thinker.rankers.standard(); + var stemmer = Thinker.processors.stemmers.english(); + + var exampleTextsCopy = JSON.parse(JSON.stringify(exampleTexts)); + + thinker.addWordProcessor(stemmer); + thinker.ranker = ranker; + + thinker.feed(exampleTextsCopy); + + describe('Search for "considerable"', function () { + var result = thinker.find("considerable"); + + it('Should be interpreted as "consider"', function () { + result.results.expressions[0].interpretation.should.equal("consider"); + }); + + }); + + describe('Search for "triplicate"', function () { + var result = thinker.find("triplicate"); + + it('Should be interpreted as "triplic"', function () { + result.results.expressions[0].interpretation.should.equal("triplic"); + }); + + }); + + describe('Search for "dependent"', function () { + var result = thinker.find("dependent"); + + it('Should be interpreted as "depend"', function () { + result.results.expressions[0].interpretation.should.equal("depend"); + }); + + }); + + describe('Search for "probate"', function () { + var result = thinker.find("probate"); + + it('Should be interpreted as "probat"', function () { + result.results.expressions[0].interpretation.should.equal("probat"); + }); + + }); + + describe('Search for "controllable"', function () { + var result = thinker.find("controllable"); + + it('Should 
be interpreted as "control"', function () { + result.results.expressions[0].interpretation.should.equal("control"); + }); + + }); + + describe('Search for "rolling"', function () { + var result = thinker.find("rolling"); + + it('Should be interpreted as "roll"', function () { + result.results.expressions[0].interpretation.should.equal("roll"); + }); + + }); + +}); + describe('Field processor: HTML-Stripper', function () { var thinker = Thinker(); var ranker = Thinker.rankers.standard();