Skip to content

Commit

Permalink
add synonym analyzer mapping gene symbols containing spaces
Browse files Browse the repository at this point in the history
  • Loading branch information
sarmbruster committed Apr 5, 2020
1 parent 2d8ec02 commit 6bb0901
Show file tree
Hide file tree
Showing 7 changed files with 305 additions and 3 deletions.
5 changes: 5 additions & 0 deletions readme.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Set up a fulltext index and specify the custom analyzer:
[source,cypher]
----
CALL db.index.fulltext.createNodeIndex("myindex", ["Articles"], ["abstract", "content"], {analyzer: "whitespace_lower"});
CALL db.index.fulltext.createNodeIndex("myindex", ["Articles"], ["abstract", "content"], {analyzer: "synonym"});
----

## Reference of available analyzers
Expand All @@ -31,3 +32,7 @@ CALL db.index.fulltext.createNodeIndex("myindex", ["Articles"], ["abstract", "co
This analyzer combines the `whitespace` analyzer with German and English stopword lists and applies a toLower conversion as well.
Because it uses the `whitespace` tokenizer, terms containing dashes, e.g. `PNAS-108` as a gene symbol or technical terms like `x-270°C`, are treated as one atomic term.

### synonym

The `synonym` analyzer is aware of gene symbols containing whitespace, e.g. `cGK 1`. Such multi-word terms are mapped to a single-word alternative (e.g. `cGK1`) using Lucene's `SynonymFilterFactory`. Aside from this mapping, the `synonym` analyzer works exactly like `whitespace_lower`.

Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.neo4j.contrib.analyzers;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.neo4j.graphdb.index.fulltext.AnalyzerProvider;
import org.neo4j.helpers.Service;

import java.io.IOException;

/**
 * Provides the {@code synonym} fulltext analyzer.
 *
 * <p>The analyzer built by {@link #createAnalyzer()} chains, in this order:
 * <ol>
 *   <li>a whitespace tokenizer,</li>
 *   <li>a synonym filter fed from the {@code gene_symbols.txt} resource (case-insensitive),</li>
 *   <li>a stop filter using the English and German snowball stop word lists (case-insensitive),</li>
 *   <li>a lower case filter.</li>
 * </ol>
 */
@Service.Implementation(AnalyzerProvider.class)
public class SynonymAnalyzerProvider extends AnalyzerProvider {

    public static final String DESCRIPTION = "analyzer using synonyms";
    public static final String ANALYZER_NAME = "synonym";

    /** Resource name of the synonym mappings handed to the synonym filter. */
    private static final String SYNONYMS_RESOURCE = "gene_symbols.txt";

    /** Comma separated stop word resources handed to the stop filter. */
    private static final String STOP_WORD_RESOURCES =
            "org/apache/lucene/analysis/snowball/english_stop.txt,org/apache/lucene/analysis/snowball/german_stop.txt";

    /** Registers this provider under {@link #ANALYZER_NAME} without any alternative names. */
    public SynonymAnalyzerProvider() {
        super(ANALYZER_NAME, new String[0]);
    }

    /**
     * Builds a new analyzer instance with the filter chain described on the class.
     *
     * @return the freshly built analyzer
     * @throws RuntimeException wrapping the {@link IOException} raised while building the analyzer,
     *         e.g. when one of the configured resources cannot be loaded
     */
    @Override
    public Analyzer createAnalyzer() {
        try {
            return CustomAnalyzer.builder()
                    .withTokenizer(WhitespaceTokenizerFactory.class)
                    // Synonym mapping runs first so it sees the raw whitespace-separated tokens.
                    .addTokenFilter(SynonymFilterFactory.class,
                            "synonyms", SYNONYMS_RESOURCE,
                            "ignoreCase", "true")
                    .addTokenFilter(StopFilterFactory.class,
                            "format", "snowball",
                            "words", STOP_WORD_RESOURCES,
                            "ignoreCase", "true")
                    .addTokenFilter(LowerCaseFilterFactory.class)
                    .build();
        } catch (IOException e) {
            // Keep the cause, but say which analyzer and which resource were involved.
            throw new RuntimeException(
                    "Unable to build the '" + ANALYZER_NAME + "' analyzer (synonyms resource: "
                            + SYNONYMS_RESOURCE + ")", e);
        }
    }

    /**
     * @return the human readable description of this analyzer, {@link #DESCRIPTION}
     */
    @Override
    public String description() {
        return DESCRIPTION;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
public class WhitespaceLowerAnalyzerProvider extends AnalyzerProvider {

public static final String DESCRIPTION = "same as whitespace analyzer, but additionally applies a lower case filter to all tokens";
public static final String ANALYZER_NAME = "whitespace_lower";

public WhitespaceLowerAnalyzerProvider() {
super("whitespace_lower", new String[0]);
super(ANALYZER_NAME, new String[0]);
}

public Analyzer createAnalyzer() {
Expand Down
203 changes: 203 additions & 0 deletions src/main/resources/gene_symbols.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# this contains the gene symbol mappings, built with
# echo "match (g:GeneSymbol:OmitInSearch) where g.sid contains ' ' return g.sid order by g.sid ;" | cypher-shell -a bolt://covid.petesis.com:7687 -u neo4j -p <pw> | sed -e 's/^"//' -e 's/"$//' | gawk -F '|' '{printf "%s => ", $1 } {gsub(" ", ""); print}' > ~/lib/neo4j/neo4j-additional-analyzers/src/main/resources/gene_symbols.txt
#
1-AGPAT 3 => 1-AGPAT3
1-AGPAT 6 => 1-AGPAT6
ADAM 21 => ADAM21
ADAM 22 => ADAM22
ADAM 28 => ADAM28
ADAM 7 => ADAM7
ADAM-TS 11 => ADAM-TS11
ADAM-TS 5 => ADAM-TS5
ADAM-TS 6 => ADAM-TS6
ADAM-TS 7 => ADAM-TS7
ADPRC 1 => ADPRC1
ADPRT 1 => ADPRT1
ADSS 2 => ADSS2
AFB1-AR 3 => AFB1-AR3
AGPAT 10 => AGPAT10
AK 4 => AK4
AK 7 => AK7
AK 8 => AK8
AK 9 => AK9
AKAP 110 => AKAP110
AKAP 82 => AKAP82
AKAP 95 => AKAP95
AKR1B10L\, AK1R1B7 => AKR1B10L\,AK1R1B7
AMan II => AManII
ANT 1 => ANT1
ANT 2 => ANT2
ANT 3 => ANT3
ANT 4 => ANT4
AVPR V1a => AVPRV1a
BCKDH E1-beta => BCKDHE1-beta
C C CKR3 => CCCKR3
C-C CKR-6 => C-CCKR-6
CA 15-3 => CA15-3
CA-RP II => CA-RPII
CAM-PDE 1A => CAM-PDE1A
CCCAP SLSN7 => CCCAPSLSN7
CCX CKR => CCXCKR
CDS 1 => CDS1
CGI-PDE A => CGI-PDEA
CI-18 kDa => CI-18kDa
CKI-gamma 3 => CKI-gamma3
CKR 3 => CKR3
COX IV-1 => COXIV-1
CaM KMT => CaMKMT
CaMK IV => CaMKIV
ColGalT 1 => ColGalT1
ColGalT 2 => ColGalT2
DC LAMP => DCLAMP
DEE 2 => DEE2
DPP IX => DPPIX
Dyna III => DynaIII
E-NPP 7 => E-NPP7
EBSP\, hNATL => EBSP\,hNATL
EKI 1 => EKI1
EPAC 2 => EPAC2
Exo V => ExoV
F9 p22 => F9p22
GAP (1-12) => GAP(1-12)
GAP (1-8) => GAP(1-8)
GCAP 2 => GCAP2
GFAT 1 => GFAT1
GFAT 2 => GFAT2
GGT 1 => GGT1
GGT 2 => GGT2
GGT 5 => GGT5
GIIE sPLA2 => GIIEsPLA2
GMPR 1 => GMPR1
GMPR 2 => GMPR2
GPT 2 => GPT2
GST 13-13 => GST13-13
GSTO 1-1 => GSTO1-1
GSTO 2-2 => GSTO2-2
GTF2IRD2 alpha => GTF2IRD2alpha
GnT I.2 => GnTI.2
H3-K9-HMTase 1 => H3-K9-HMTase1
HSF 2 => HSF2
HSF 5 => HSF5
HSP 75 => HSP75
HSTF 2 => HSTF2
HSTF 5 => HSTF5
IL-1 alpha => IL-1alpha
IMP 3 => IMP3
IPS 1 => IPS1
IRE-BP 2 => IRE-BP2
ISG-54 K => ISG-54K
K-Ras 2 => K-Ras2
LACS 3 => LACS3
LACS 6 => LACS6
LCB 3 => LCB3
LPLAT 1 => LPLAT1
LPLAT 2 => LPLAT2
LPLAT 5 => LPLAT5
LTB4-R 2 => LTB4-R2
LeIF F => LeIFF
MAP 1D => MAP1D
MAPK 12 => MAPK12
MAPK 13 => MAPK13
MAST 9 => MAST9
MCT 11 => MCT11
MCT 3 => MCT3
MCT 4 => MCT4
MCT 7 => MCT7
MCT 8 => MCT8
MEK 7 => MEK7
MEKK 1 => MEKK1
MEKK 4 => MEKK4
MEKKK 3 => MEKKK3
MIP-1 delta => MIP-1delta
MLN 19 => MLN19
MPR 300 => MPR300
MPR 46 => MPR46
MT-MMP 1 => MT-MMP1
MT-MMP 5 => MT-MMP5
MT-MMP 6 => MT-MMP6
MetAP 1D => MetAP1D
N-HSST 2 => N-HSST2
N-HSST 4 => N-HSST4
NB1 GP => NB1GP
NBP 2 => NBP2
NDK 6 => NDK6
NDK 7 => NDK7
NEC 2 => NEC2
NMHC II-C => NMHCII-C
NMP 238 => NMP238
ODG6; GAMOS7 => ODG6;GAMOS7
P-PST 2 => P-PST2
PAP IB => PAPIB
PC I-NP => PCI-NP
PDPC 2 => PDPC2
PGM 3 => PGM3
PGP 9.5 => PGP9.5
PKA C-beta => PKAC-beta
PLC eta 1 => PLCeta1
PMM 1 => PMM1
PMM 2 => PMM2
PTH 2 => PTH2
Pol Mu => PolMu
REG III => REGIII
RIM 3 => RIM3
RIM 4 => RIM4
RP-A p14 => RP-Ap14
RP-A p32 => RP-Ap32
RP-A p34 => RP-Ap34
S152. LPFS2 => S152.LPFS2
S5AR 1 => S5AR1
SDH 2 => SDH2
SIII p110 => SIIIp110
SK 2 => SK2
SKCa 2 => SKCa2
SP-A1 beta => SP-A1beta
SP-A1 delta => SP-A1delta
SP-A1 epsilon => SP-A1epsilon
SP-A1 gamma => SP-A1gamma
SPK 2 => SPK2
SPT 3 => SPT3
ST3Gal III => ST3GalIII
ST3Gal V => ST3GalV
STRAD alpha => STRADalpha
SynCAM 2 => SynCAM2
TC II => TCII
TIL. LPRS5 => TIL.LPRS5
TPT 1 => TPT1
TS. LQT8 => TS.LQT8
TTCP 1 => TTCP1
UBCHBEN; UBC13 => UBCHBEN;UBC13
UDPGT 1-1 => UDPGT1-1
UDPGT 1-3 => UDPGT1-3
UDPGT 1-4 => UDPGT1-4
UDPGT 1-5 => UDPGT1-5
UDPGT 1-6 => UDPGT1-6
UDPGT 1-7 => UDPGT1-7
UDPGT 1-8 => UDPGT1-8
UDPGT 1-9 => UDPGT1-9
UDPGT 2B7 => UDPGT2B7
UDPGT 2B8 => UDPGT2B8
UDPGT 2B9 => UDPGT2B9
UP 1 => UP1
ara CALC => araCALC
beta-1\,3-GalTase 5 => beta-1\,3-GalTase5
bic-D 1 => bic-D1
cGK 1 => cGK1
cam-PDE 1C => cam-PDE1C
eIF-4G 3 => eIF-4G3
eIF4G 3 => eIF4G3
eMDC II => eMDCII
glcNAc-T V => glcNAc-TV
hDH V => hDHV
hnRNP A1 => hnRNPA1
hnRNP M => hnRNPM
hnRNP U => hnRNPU
leIF G => leIFG
locus 4010 => locus4010
mEF-G 2 => mEF-G2
mPA-PLA1 beta => mPA-PLA1beta
nRap GEP => nRapGEP
p alpha => palpha
p30 DBC => p30DBC
p59 OASL => p59OASL
p70 S6KA => p70S6KA
p84 PIKAP => p84PIKAP
49 changes: 49 additions & 0 deletions src/test/java/org/neo4j/contrib/analyzers/SynonymAnalyzerTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package org.neo4j.contrib.analyzers;

import org.apache.lucene.analysis.Analyzer;
import org.junit.Rule;
import org.junit.Test;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.index.fulltext.AnalyzerProvider;
import org.neo4j.harness.junit.Neo4jRule;
import org.neo4j.helpers.collection.Iterators;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.contains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.neo4j.contrib.analyzers.SynonymAnalyzerProvider.ANALYZER_NAME;
import static org.neo4j.contrib.analyzers.SynonymAnalyzerProvider.DESCRIPTION;

/**
 * Exercises the {@code synonym} analyzer: its token stream, its registration
 * in the database and a fulltext search against an index that uses it.
 */
public class SynonymAnalyzerTest {

    @Rule
    public Neo4jRule neo4j = new Neo4jRule();

    /**
     * Looks the provider up by name and checks the tokens it emits for a few
     * sample inputs, including a gene symbol that contains a space.
     */
    @Test
    public void checkTokenStream() {
        final AnalyzerProvider synonymProvider = AnalyzerProvider.getProviderByName(ANALYZER_NAME);
        assertNotNull(synonymProvider);
        assertEquals(DESCRIPTION, synonymProvider.description());

        final Analyzer analyzer = synonymProvider.createAnalyzer();

        assertThat(
                TestUtils.analyze("CAN Open Bus Logo", analyzer),
                contains("can", "open", "bus", "logo"));
        assertThat(
                TestUtils.analyze("_Modbus", analyzer),
                contains("_modbus"));
        assertThat(
                TestUtils.analyze("abc x-270°", analyzer),
                contains("abc", "x-270°"));
        // "cGK 1" is expected to collapse into the single token "cgk1".
        assertThat(
                TestUtils.analyze("cGK 1 is a gene symbol", analyzer),
                contains("cgk1", "gene", "symbol"));
    }

    /** Checks that the database lists the analyzer with the expected description. */
    @Test
    public void checkAnalyzerIsAvailable() {
        final GraphDatabaseService database = neo4j.getGraphDatabaseService();
        TestUtils.checkForAnalyzer(database, ANALYZER_NAME, DESCRIPTION);
    }

    /**
     * Indexes one article whose title contains a dashed term and checks that
     * only the complete term, not its tail, produces a hit.
     */
    @Test
    public void checkSearchForTermContainingDash() {
        final GraphDatabaseService graphDb = neo4j.getGraphDatabaseService();

        graphDb.execute("CALL db.index.fulltext.createNodeIndex('myIndex', ['Article'], ['title'], {analyzer: 'synonym'})");
        graphDb.execute("CREATE (:Article{title:'abc x-270°'})");

        final String fullTermQuery =
                "CALL db.index.fulltext.queryNodes('myIndex', 'x\\\\-270°') yield node, score return node.title as text, score";
        final long fullTermHits = Iterators.count(graphDb.execute(fullTermQuery));
        assertEquals(1, fullTermHits);

        final String partialTermQuery =
                "CALL db.index.fulltext.queryNodes('myIndex', '\\\\-270°') yield node, score return node.title as text, score";
        final long partialTermHits = Iterators.count(graphDb.execute(partialTermQuery));
        assertEquals(0, partialTermHits);
    }
}
2 changes: 2 additions & 0 deletions src/test/java/org/neo4j/contrib/analyzers/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@

public class TestUtils {
static List<String> analyze(String text, Analyzer analyzer) {
System.out.println("analyzing: " + text);
try {
List<String> result = new ArrayList<String>();
TokenStream tokenStream = analyzer.tokenStream("dummy", text);
CharTermAttribute attr = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
result.add(attr.toString());
System.out.println("term: " + attr.toString());
}
tokenStream.close();
return result;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import static org.hamcrest.Matchers.contains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.neo4j.contrib.analyzers.WhitespaceLowerAnalyzerProvider.ANALYZER_NAME;

public class WhitespaceLowerTest {

Expand All @@ -21,7 +22,7 @@ public class WhitespaceLowerTest {

@Test
public void checkTokenStream() {
AnalyzerProvider provider = AnalyzerProvider.getProviderByName("whitespace_lower");
AnalyzerProvider provider = AnalyzerProvider.getProviderByName(ANALYZER_NAME);
assertNotNull(provider);
assertEquals(WhitespaceLowerAnalyzerProvider.DESCRIPTION, provider.description());
Analyzer analzyer = provider.createAnalyzer();
Expand All @@ -35,7 +36,7 @@ public void checkTokenStream() {

@Test
public void checkAnalyzerIsAvailable() {
TestUtils.checkForAnalyzer(neo4j.getGraphDatabaseService(), "whitespace_lower", WhitespaceLowerAnalyzerProvider.DESCRIPTION);
TestUtils.checkForAnalyzer(neo4j.getGraphDatabaseService(), ANALYZER_NAME, WhitespaceLowerAnalyzerProvider.DESCRIPTION);
}

@Test
Expand Down

0 comments on commit 6bb0901

Please sign in to comment.