-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add synonym analyzer mapping gene symbols containing spaces
- Loading branch information
1 parent
2d8ec02
commit 6bb0901
Showing
7 changed files
with
305 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
src/main/java/org/neo4j/contrib/analyzers/SynonymAnalyzerProvider.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package org.neo4j.contrib.analyzers; | ||
|
||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.core.LowerCaseFilterFactory; | ||
import org.apache.lucene.analysis.core.StopFilterFactory; | ||
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory; | ||
import org.apache.lucene.analysis.custom.CustomAnalyzer; | ||
import org.apache.lucene.analysis.synonym.SynonymFilterFactory; | ||
import org.neo4j.graphdb.index.fulltext.AnalyzerProvider; | ||
import org.neo4j.helpers.Service; | ||
|
||
import java.io.IOException; | ||
|
||
@Service.Implementation(AnalyzerProvider.class) | ||
public class SynonymAnalyzerProvider extends AnalyzerProvider { | ||
|
||
public static final String DESCRIPTION = "analyzer using synonyms"; | ||
public static final String ANALYZER_NAME = "synonym"; | ||
|
||
public SynonymAnalyzerProvider() { | ||
super(ANALYZER_NAME, new String[0]); | ||
} | ||
|
||
public Analyzer createAnalyzer() { | ||
try { | ||
return CustomAnalyzer.builder() | ||
.withTokenizer(WhitespaceTokenizerFactory.class) | ||
.addTokenFilter(SynonymFilterFactory.class, "synonyms", "gene_symbols.txt", "ignoreCase", "true") | ||
.addTokenFilter(StopFilterFactory.class, "format", "snowball", "words", "org/apache/lucene/analysis/snowball/english_stop.txt,org/apache/lucene/analysis/snowball/german_stop.txt", "ignoreCase", "true") | ||
.addTokenFilter(LowerCaseFilterFactory.class) | ||
.build(); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
@Override | ||
public String description() { | ||
return DESCRIPTION; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
# This file maps each gene symbol that contains spaces to the same symbol with the spaces removed; it was built with | ||
# echo "match (g:GeneSymbol:OmitInSearch) where g.sid contains ' ' return g.sid order by g.sid ;" | cypher-shell -a bolt://covid.petesis.com:7687 -u neo4j -p <pw> | sed -e 's/^"//' -e 's/"$//' | gawk -F '|' '{printf "%s => ", $1 } {gsub(" ", ""); print}' > ~/lib/neo4j/neo4j-additional-analyzers/src/main/resources/gene_symbols.txt | ||
# | ||
1-AGPAT 3 => 1-AGPAT3 | ||
1-AGPAT 6 => 1-AGPAT6 | ||
ADAM 21 => ADAM21 | ||
ADAM 22 => ADAM22 | ||
ADAM 28 => ADAM28 | ||
ADAM 7 => ADAM7 | ||
ADAM-TS 11 => ADAM-TS11 | ||
ADAM-TS 5 => ADAM-TS5 | ||
ADAM-TS 6 => ADAM-TS6 | ||
ADAM-TS 7 => ADAM-TS7 | ||
ADPRC 1 => ADPRC1 | ||
ADPRT 1 => ADPRT1 | ||
ADSS 2 => ADSS2 | ||
AFB1-AR 3 => AFB1-AR3 | ||
AGPAT 10 => AGPAT10 | ||
AK 4 => AK4 | ||
AK 7 => AK7 | ||
AK 8 => AK8 | ||
AK 9 => AK9 | ||
AKAP 110 => AKAP110 | ||
AKAP 82 => AKAP82 | ||
AKAP 95 => AKAP95 | ||
AKR1B10L, AK1R1B7 => AKR1B10L,AK1R1B7 | ||
AMan II => AManII | ||
ANT 1 => ANT1 | ||
ANT 2 => ANT2 | ||
ANT 3 => ANT3 | ||
ANT 4 => ANT4 | ||
AVPR V1a => AVPRV1a | ||
BCKDH E1-beta => BCKDHE1-beta | ||
C C CKR3 => CCCKR3 | ||
C-C CKR-6 => C-CCKR-6 | ||
CA 15-3 => CA15-3 | ||
CA-RP II => CA-RPII | ||
CAM-PDE 1A => CAM-PDE1A | ||
CCCAP SLSN7 => CCCAPSLSN7 | ||
CCX CKR => CCXCKR | ||
CDS 1 => CDS1 | ||
CGI-PDE A => CGI-PDEA | ||
CI-18 kDa => CI-18kDa | ||
CKI-gamma 3 => CKI-gamma3 | ||
CKR 3 => CKR3 | ||
COX IV-1 => COXIV-1 | ||
CaM KMT => CaMKMT | ||
CaMK IV => CaMKIV | ||
ColGalT 1 => ColGalT1 | ||
ColGalT 2 => ColGalT2 | ||
DC LAMP => DCLAMP | ||
DEE 2 => DEE2 | ||
DPP IX => DPPIX | ||
Dyna III => DynaIII | ||
E-NPP 7 => E-NPP7 | ||
EBSP, hNATL => EBSP,hNATL | ||
EKI 1 => EKI1 | ||
EPAC 2 => EPAC2 | ||
Exo V => ExoV | ||
F9 p22 => F9p22 | ||
GAP (1-12) => GAP(1-12) | ||
GAP (1-8) => GAP(1-8) | ||
GCAP 2 => GCAP2 | ||
GFAT 1 => GFAT1 | ||
GFAT 2 => GFAT2 | ||
GGT 1 => GGT1 | ||
GGT 2 => GGT2 | ||
GGT 5 => GGT5 | ||
GIIE sPLA2 => GIIEsPLA2 | ||
GMPR 1 => GMPR1 | ||
GMPR 2 => GMPR2 | ||
GPT 2 => GPT2 | ||
GST 13-13 => GST13-13 | ||
GSTO 1-1 => GSTO1-1 | ||
GSTO 2-2 => GSTO2-2 | ||
GTF2IRD2 alpha => GTF2IRD2alpha | ||
GnT I.2 => GnTI.2 | ||
H3-K9-HMTase 1 => H3-K9-HMTase1 | ||
HSF 2 => HSF2 | ||
HSF 5 => HSF5 | ||
HSP 75 => HSP75 | ||
HSTF 2 => HSTF2 | ||
HSTF 5 => HSTF5 | ||
IL-1 alpha => IL-1alpha | ||
IMP 3 => IMP3 | ||
IPS 1 => IPS1 | ||
IRE-BP 2 => IRE-BP2 | ||
ISG-54 K => ISG-54K | ||
K-Ras 2 => K-Ras2 | ||
LACS 3 => LACS3 | ||
LACS 6 => LACS6 | ||
LCB 3 => LCB3 | ||
LPLAT 1 => LPLAT1 | ||
LPLAT 2 => LPLAT2 | ||
LPLAT 5 => LPLAT5 | ||
LTB4-R 2 => LTB4-R2 | ||
LeIF F => LeIFF | ||
MAP 1D => MAP1D | ||
MAPK 12 => MAPK12 | ||
MAPK 13 => MAPK13 | ||
MAST 9 => MAST9 | ||
MCT 11 => MCT11 | ||
MCT 3 => MCT3 | ||
MCT 4 => MCT4 | ||
MCT 7 => MCT7 | ||
MCT 8 => MCT8 | ||
MEK 7 => MEK7 | ||
MEKK 1 => MEKK1 | ||
MEKK 4 => MEKK4 | ||
MEKKK 3 => MEKKK3 | ||
MIP-1 delta => MIP-1delta | ||
MLN 19 => MLN19 | ||
MPR 300 => MPR300 | ||
MPR 46 => MPR46 | ||
MT-MMP 1 => MT-MMP1 | ||
MT-MMP 5 => MT-MMP5 | ||
MT-MMP 6 => MT-MMP6 | ||
MetAP 1D => MetAP1D | ||
N-HSST 2 => N-HSST2 | ||
N-HSST 4 => N-HSST4 | ||
NB1 GP => NB1GP | ||
NBP 2 => NBP2 | ||
NDK 6 => NDK6 | ||
NDK 7 => NDK7 | ||
NEC 2 => NEC2 | ||
NMHC II-C => NMHCII-C | ||
NMP 238 => NMP238 | ||
ODG6; GAMOS7 => ODG6;GAMOS7 | ||
P-PST 2 => P-PST2 | ||
PAP IB => PAPIB | ||
PC I-NP => PCI-NP | ||
PDPC 2 => PDPC2 | ||
PGM 3 => PGM3 | ||
PGP 9.5 => PGP9.5 | ||
PKA C-beta => PKAC-beta | ||
PLC eta 1 => PLCeta1 | ||
PMM 1 => PMM1 | ||
PMM 2 => PMM2 | ||
PTH 2 => PTH2 | ||
Pol Mu => PolMu | ||
REG III => REGIII | ||
RIM 3 => RIM3 | ||
RIM 4 => RIM4 | ||
RP-A p14 => RP-Ap14 | ||
RP-A p32 => RP-Ap32 | ||
RP-A p34 => RP-Ap34 | ||
S152. LPFS2 => S152.LPFS2 | ||
S5AR 1 => S5AR1 | ||
SDH 2 => SDH2 | ||
SIII p110 => SIIIp110 | ||
SK 2 => SK2 | ||
SKCa 2 => SKCa2 | ||
SP-A1 beta => SP-A1beta | ||
SP-A1 delta => SP-A1delta | ||
SP-A1 epsilon => SP-A1epsilon | ||
SP-A1 gamma => SP-A1gamma | ||
SPK 2 => SPK2 | ||
SPT 3 => SPT3 | ||
ST3Gal III => ST3GalIII | ||
ST3Gal V => ST3GalV | ||
STRAD alpha => STRADalpha | ||
SynCAM 2 => SynCAM2 | ||
TC II => TCII | ||
TIL. LPRS5 => TIL.LPRS5 | ||
TPT 1 => TPT1 | ||
TS. LQT8 => TS.LQT8 | ||
TTCP 1 => TTCP1 | ||
UBCHBEN; UBC13 => UBCHBEN;UBC13 | ||
UDPGT 1-1 => UDPGT1-1 | ||
UDPGT 1-3 => UDPGT1-3 | ||
UDPGT 1-4 => UDPGT1-4 | ||
UDPGT 1-5 => UDPGT1-5 | ||
UDPGT 1-6 => UDPGT1-6 | ||
UDPGT 1-7 => UDPGT1-7 | ||
UDPGT 1-8 => UDPGT1-8 | ||
UDPGT 1-9 => UDPGT1-9 | ||
UDPGT 2B7 => UDPGT2B7 | ||
UDPGT 2B8 => UDPGT2B8 | ||
UDPGT 2B9 => UDPGT2B9 | ||
UP 1 => UP1 | ||
ara CALC => araCALC | ||
beta-1,3-GalTase 5 => beta-1,3-GalTase5 | ||
bic-D 1 => bic-D1 | ||
cGK 1 => cGK1 | ||
cam-PDE 1C => cam-PDE1C | ||
eIF-4G 3 => eIF-4G3 | ||
eIF4G 3 => eIF4G3 | ||
eMDC II => eMDCII | ||
glcNAc-T V => glcNAc-TV | ||
hDH V => hDHV | ||
hnRNP A1 => hnRNPA1 | ||
hnRNP M => hnRNPM | ||
hnRNP U => hnRNPU | ||
leIF G => leIFG | ||
locus 4010 => locus4010 | ||
mEF-G 2 => mEF-G2 | ||
mPA-PLA1 beta => mPA-PLA1beta | ||
nRap GEP => nRapGEP | ||
p alpha => palpha | ||
p30 DBC => p30DBC | ||
p59 OASL => p59OASL | ||
p70 S6KA => p70S6KA | ||
p84 PIKAP => p84PIKAP |
49 changes: 49 additions & 0 deletions
49
src/test/java/org/neo4j/contrib/analyzers/SynonymAnalyzerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package org.neo4j.contrib.analyzers; | ||
|
||
import org.apache.lucene.analysis.Analyzer; | ||
import org.junit.Rule; | ||
import org.junit.Test; | ||
import org.neo4j.graphdb.GraphDatabaseService; | ||
import org.neo4j.graphdb.index.fulltext.AnalyzerProvider; | ||
import org.neo4j.harness.junit.Neo4jRule; | ||
import org.neo4j.helpers.collection.Iterators; | ||
|
||
import static org.hamcrest.MatcherAssert.assertThat; | ||
import static org.hamcrest.Matchers.contains; | ||
import static org.junit.Assert.assertEquals; | ||
import static org.junit.Assert.assertNotNull; | ||
import static org.neo4j.contrib.analyzers.SynonymAnalyzerProvider.ANALYZER_NAME; | ||
import static org.neo4j.contrib.analyzers.SynonymAnalyzerProvider.DESCRIPTION; | ||
|
||
public class SynonymAnalyzerTest { | ||
|
||
@Rule | ||
public Neo4jRule neo4j = new Neo4jRule(); | ||
|
||
@Test | ||
public void checkTokenStream() { | ||
AnalyzerProvider provider = AnalyzerProvider.getProviderByName(ANALYZER_NAME); | ||
assertNotNull(provider); | ||
assertEquals(DESCRIPTION, provider.description()); | ||
Analyzer analzyer = provider.createAnalyzer(); | ||
assertThat(TestUtils.analyze("CAN Open Bus Logo", analzyer), contains("can", "open", "bus", "logo")); | ||
assertThat(TestUtils.analyze("_Modbus", analzyer), contains("_modbus")); | ||
assertThat(TestUtils.analyze("abc x-270°", analzyer), contains("abc", "x-270°")); | ||
assertThat(TestUtils.analyze("cGK 1 is a gene symbol", analzyer), contains("cgk1", "gene", "symbol")); | ||
} | ||
|
||
@Test | ||
public void checkAnalyzerIsAvailable() { | ||
TestUtils.checkForAnalyzer(neo4j.getGraphDatabaseService(), ANALYZER_NAME, DESCRIPTION); | ||
} | ||
|
||
@Test | ||
public void checkSearchForTermContainingDash() { | ||
GraphDatabaseService db = neo4j.getGraphDatabaseService(); | ||
db.execute("CALL db.index.fulltext.createNodeIndex('myIndex', ['Article'], ['title'], {analyzer: 'synonym'})"); | ||
db.execute( "CREATE (:Article{title:'abc x-270°'})"); | ||
|
||
assertEquals(1, Iterators.count(db.execute("CALL db.index.fulltext.queryNodes('myIndex', 'x\\\\-270°') yield node, score return node.title as text, score"))); | ||
assertEquals(0, Iterators.count(db.execute("CALL db.index.fulltext.queryNodes('myIndex', '\\\\-270°') yield node, score return node.title as text, score"))); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters