Skip to content

Commit

Permalink
use spaCy's PhraseMatcher to find word wise phrases
Browse files Browse the repository at this point in the history
also match lemma contains "'"
  • Loading branch information
xxyzz committed Sep 1, 2021
1 parent 987ce03 commit 0a6b8f1
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 22 deletions.
2 changes: 1 addition & 1 deletion __init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
from calibre.customize import InterfaceActionBase

VERSION = (3, 11, 2)
VERSION = (3, 12, 0)


class WordDumbDumb(InterfaceActionBase):
Expand Down
81 changes: 62 additions & 19 deletions parse_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import re

from calibre.constants import ismacos
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
from calibre.utils.logging import default_log
Expand All @@ -20,8 +21,13 @@ def do_job(data, create_ww=True, create_x=True,
if updata_asin:
set_asin(mi, asin, book_fmt, book_path)
model = lang['spacy'] + prefs['model_size']
install_libs(model, create_ww, create_x)
install_libs(model, create_ww)
is_kfx = book_fmt == 'KFX'
ll_conn = None
ll_path = None
lemmas = None
x_ray_conn = None
x_ray_path = None

if create_ww:
ll_conn, ll_path = create_lang_layer(asin, book_path, book_fmt)
Expand All @@ -34,21 +40,9 @@ def do_job(data, create_ww=True, create_x=True,
if x_ray_conn is None:
create_x = False

if create_x:
x_ray = X_Ray(x_ray_conn, lang['wiki'])
import spacy
nlp = spacy.load(model,
exclude=['tok2vec', 'morphologizer', 'tagger',
'parser', 'attribute_ruler', 'lemmatizer'])
nlp.enable_pipe("senter")

for doc, start in nlp.pipe(parse_book(book_path, is_kfx),
as_tuples=True):
find_named_entity(start, x_ray, doc, is_kfx)
if create_ww:
find_lemma(start, doc.text, lemmas, ll_conn, is_kfx)

x_ray.finish(x_ray_path)
if not ismacos:
spacy_job(create_ww, create_x, book_path, is_kfx, lemmas,
ll_conn, model, lang, x_ray_conn, x_ray_path)
elif create_ww:
for text, start in parse_book(book_path, is_kfx):
find_lemma(start, text, lemmas, ll_conn, is_kfx)
Expand All @@ -57,6 +51,38 @@ def do_job(data, create_ww=True, create_x=True,
save_db(ll_conn, ll_path)


def spacy_job(create_ww, create_x, book_path, is_kfx, lemmas,
ll_conn, model, lang, x_ray_conn, x_ray_path):
import spacy
if create_ww:
from spacy.matcher import PhraseMatcher
nlp_en = spacy.blank('en')
matcher = PhraseMatcher(nlp_en.vocab, attr='LOWER')
patterns = list(
nlp_en.pipe(
filter(lambda x: ' ' in x or '-' in x, lemmas.keys())))
matcher.add("phrases", patterns)
if not create_x:
for doc, start in nlp_en.pipe(
parse_book(book_path, is_kfx), as_tuples=True):
find_phrase_and_lemma(
doc, matcher, start, lemmas, ll_conn, is_kfx)
if create_x:
nlp = spacy.load(model, exclude=[
'tok2vec', 'morphologizer', 'tagger',
'parser', 'attribute_ruler', 'lemmatizer'])
nlp.enable_pipe("senter")
x_ray = X_Ray(x_ray_conn, lang['wiki'])
for doc, start in nlp.pipe(
parse_book(book_path, is_kfx), as_tuples=True):
find_named_entity(start, x_ray, doc, is_kfx)
if create_ww:
find_phrase_and_lemma(
doc, matcher, start, lemmas, ll_conn, is_kfx)

x_ray.finish(x_ray_path)


def parse_book(book_path, is_kfx):
if is_kfx:
yield from parse_kfx(book_path)
Expand Down Expand Up @@ -98,11 +124,28 @@ def parse_mobi(book_path):
match_text.start() + 1)


def find_lemma(start, text, lemmas, ll_conn, is_kfx):
def find_phrase_and_lemma(doc, matcher, start, lemmas, ll_conn, is_kfx):
ranges = set()
for span in matcher(doc, as_spans=True):
lemma = doc.text[span.start_char:span.end_char].lower()
if is_kfx:
index = start + span.start_char
else:
index = start + len(doc.text[:span.start_char].encode('utf-8'))
insert_lemma(ll_conn, (index,) + tuple(lemmas[lemma]))
ranges.add(span.start_char)
find_lemma(start, doc.text, lemmas, ll_conn, is_kfx, ranges)


def find_lemma(start, text, lemmas, ll_conn, is_kfx, ranges=None):
from nltk.corpus import wordnet as wn

for match in re.finditer(r'[a-zA-Z\u00AD]{3,}', text):
lemma = wn.morphy(match.group(0).replace('\u00AD', '').lower())
for match in re.finditer(r"[a-zA-Z'\u00AD]{3,}", text):
if ranges and match.start() in ranges:
continue
word = match.group(0).replace('\u00AD', '').lower() # rm soft hyphens
lemma = wn.morphy(word)
lemma = lemma if lemma else word
if lemma in lemmas:
if is_kfx:
index = start + match.start()
Expand Down
4 changes: 2 additions & 2 deletions unzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ def save_wiki_cache(cache_dic, lang):
json.dump(cache_dic, f)


def install_libs(model, create_ww=True, create_x=True):
def install_libs(model, create_ww):
pkgs = load_json('data/spacy.json')
if create_x:
if not ismacos:
for pkg, value in pkgs.items():
pip_install(pkg, value['version'], value['compiled'])
model_v = '3.1.0'
Expand Down

0 comments on commit 0a6b8f1

Please sign in to comment.