Skip to content

Commit

Permalink
New services for SemTab2021
Browse files Browse the repository at this point in the history
  • Loading branch information
NoYo25 committed Oct 20, 2021
1 parent 8cb71b1 commit 94b72ba
Show file tree
Hide file tree
Showing 241 changed files with 6,429 additions and 1,914 deletions.
4 changes: 3 additions & 1 deletion services/Autocorrect/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@
.directory
__pycache__
/.idea/
*.log
*.log
tfidf_ngrams.py
group_fix.py
1 change: 1 addition & 0 deletions services/Autocorrect/.pytest_cache/v/cache/lastfailed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
3 changes: 3 additions & 0 deletions services/Autocorrect/.pytest_cache/v/cache/nodeids
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[
"main.py::test"
]
8 changes: 7 additions & 1 deletion services/Autocorrect/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
ENABLE_WIKIPEDIA_CORRECTION = False

# enable/disable off_the_shelf corrections
ENABLE_OFF_THE_SHELF_CORRECTIONS = True
ENABLE_OFF_THE_SHELF_CORRECTIONS = False

# enable/disable Wikidata nearest label corrections
ENABLE_WIKIDATA_BASED_CORRECTIONS = False
Expand Down Expand Up @@ -38,6 +38,12 @@
# internal_storage_config
CUR_PATH = os.path.dirname(os.path.abspath(__file__))
if os.environ.get('DOCKERIZED', False):
CACHE_PATH = os.path.join(CUR_PATH, 'cache')
ASSET_PATH = os.path.join(CUR_PATH, 'assets')
else:
CACHE_PATH = os.path.abspath(os.path.join(CUR_PATH, '..', '..', 'assets', 'data', 'cache', 'Autocorrect'))
ASSET_PATH = os.path.abspath(os.path.join(CUR_PATH, '..', '..', 'assets', 'Autocorrect'))

# make sure all paths exist
if not os.path.exists(CACHE_PATH):
os.makedirs(CACHE_PATH)
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import sqlite3
import config
from utils.util import get_batch

CACHE_DISABLED = os.environ.get('DISABLE_CACHE', False)

Expand Down Expand Up @@ -120,19 +121,21 @@ def getMany(self, needles):
else:
# single key: use one big query

# prep query
needles = [needle[self._keys[0]] for needle in needles]
params = ', '.join('(?)' for n in needles)
query = self._queryMultiSelect % params
seen = set()
for batch in get_batch(needles, 500):

# run query
db.execute(query, needles)
# prep query
params = ', '.join('(?)' for n in batch)
query = self._queryMultiSelect % params

# parse results
seen = set()
for row in db:
result['hits'].append({'key': {self._keys[0]: row[0]}, 'val': json.loads(row[1])})
seen.add(row[0])
# run query
db.execute(query, batch)

# parse results
for row in db:
result['hits'].append({'key': {self._keys[0]: row[0]}, 'val': json.loads(row[1])})
seen.add(row[0])

# add unmatched
result['misses'] = [{self._keys[0]: needle} for needle in needles if needle not in seen]
Expand Down Expand Up @@ -174,8 +177,6 @@ def setMany(self, items):
if 'val' not in item:
raise Exception('Missing value in item')
if any(k not in item['key'] for k in self._keys):
print(item)
print(self._keys)
raise Exception('Missing field in key')

# store in db
Expand Down
8 changes: 6 additions & 2 deletions services/Autocorrect/inc/off_the_shelf_correction.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from autocorrect import Speller

import inc.cache

spell = Speller()

# cache of results
cache = inc.cache.Cache('terms', ['term'])

def correct(word):
    """Spell-correct *word* with the off-the-shelf Speller and cache the result.

    The early ``return spell(word)`` left over from the previous version made
    the caching lines unreachable dead code; it is removed so results are
    actually persisted.

    :param word: string to correct
    :return: the corrected string
    """
    res = spell(word)
    # Persist the correction so repeated requests for the same term can be
    # answered from the shared 'terms' cache instead of re-running the speller.
    cache.set({'term': word}, [res])
    return res
7 changes: 6 additions & 1 deletion services/Autocorrect/inc/w2v_correction.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import re
import config
from utils.vec_manager import *
import inc.cache

# cache of results
cache = inc.cache.Cache('terms', ['term'])

class W2VCorrection():
def __init__(self, model1):
Expand All @@ -26,7 +29,9 @@ def correct_word(self, word):
lst = lst + [word]
else:
lst = lst + [self.correction(word)]
return "{}".format(config.CLEAN_CELL_SEPARATOR).join(lst)
res = "{}".format(config.CLEAN_CELL_SEPARATOR).join(lst)
cache.set({'term': word}, [res])
return res

# Methods related to correction probability
def words(self, text):
Expand Down
56 changes: 27 additions & 29 deletions services/Autocorrect/inc/wikidata_nearest_lbl_correction.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from ftfy import fix_text
from config import ASSET_PATH


def load_file(file_object, chunk_size=chunk_size, lazy=True):
"""
Lazy function (generator) to read a file chunk by chunk
Expand Down Expand Up @@ -50,7 +51,6 @@ def get_nearest_candidate(chunk, values):

# list of labels
labels = [l for l in chunk.split('\n')] # filter on real strings only
print(len(labels))

# init threads pool
pool = multiprocessing.Pool(processes=cpus)
Expand Down Expand Up @@ -124,33 +124,33 @@ def get_candidates(vals):

def test():
# Court cases from table 3WJT10EJ.csv 2020 R2
vals = ['Spaziano . Florida', \
'Smith v/ Maryland', \
'SEC v. Texas Gulf Sumphur Co.', \
'Reieer v. Thompso', \
'Reed v. Pennsylvania Railroad Compan|', \
'Building Service Employees International Union Local 262 v/ Gazzam', \
'Ramspeck v. Federal Trial Exainers Conference', \
'Cowma Dairy Company v. United States', \
'Noswood v. Kirkpatrick', \
'Mongomery Building & Construction Trades Council v. Ledbetter Erection Company', \
'Southern Pacfic Company v. Gileo', \
'Colgate-Palmolive-Peft Company v. National Labor Relations Board', \
'Unitee States v. United States Smelting Refining', \
vals = ['Spaziano . Florida',
'Smith v/ Maryland',
'SEC v. Texas Gulf Sumphur Co.',
'Reieer v. Thompso',
'Reed v. Pennsylvania Railroad Compan|',
'Building Service Employees International Union Local 262 v/ Gazzam',
'Ramspeck v. Federal Trial Exainers Conference',
'Cowma Dairy Company v. United States',
'Noswood v. Kirkpatrick',
'Mongomery Building & Construction Trades Council v. Ledbetter Erection Company',
'Southern Pacfic Company v. Gileo',
'Colgate-Palmolive-Peft Company v. National Labor Relations Board',
'Unitee States v. United States Smelting Refining',
'Poizzi v. Cowles Magazies']
expected = ['Spaziano v. Florida', \
'Smith v. Maryland', \
'SEC v. Texas Gulf Sulphur Co', \
'Reider v. Thompson ', \
'Reed v. Pennsylvania Railroad Company', \
'Building Service Employees International Union Local 262 v. Gazzam', \
'ramspeck v. federal trial examiners conference', \
'Bowman Dairy Company v. United States', \
'Norwood v. Kirkpatrick', \
'Montgomery Building & Construction Trades Council v. Ledbetter Erection Company', \
'Southern Pacific Company v. Gileo', \
'Colgate-Palmolive-Peet Company v. National Labor Relations Board', \
'United States v. United States Smelting Refining', \
expected = ['Spaziano v. Florida',
'Smith v. Maryland',
'SEC v. Texas Gulf Sulphur Co',
'Reider v. Thompson ',
'Reed v. Pennsylvania Railroad Company',
'Building Service Employees International Union Local 262 v. Gazzam',
'ramspeck v. federal trial examiners conference',
'Bowman Dairy Company v. United States',
'Norwood v. Kirkpatrick',
'Montgomery Building & Construction Trades Council v. Ledbetter Erection Company',
'Southern Pacific Company v. Gileo',
'Colgate-Palmolive-Peet Company v. National Labor Relations Board',
'United States v. United States Smelting Refining',
'Polizzi v. Cowles Magazines']

util_log.start("test")
Expand All @@ -161,10 +161,8 @@ def test():
cnt = 0
for val, exp in zip(vals, expected):
util_log.info("'{}' is corrected as --> '{}'".format(res[val], expected))
print(res[val])
if res[val].lower() == exp.lower(): # normalize case insensitive
cnt = cnt + 1
print((cnt / len(vals)) * 100)
util_log.stop("test")


Expand Down
23 changes: 19 additions & 4 deletions services/Autocorrect/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import traceback

from utils.vec_manager import *
from solver import get_corrected_lst, get_corrected
from solver import get_corrected_lst, get_corrected, get_knowns_words

from utils import util_log

Expand All @@ -24,6 +24,18 @@ def __load_model():
print('Model loaded ... ')


@app.route('/get_knowns', methods=['POST'])
def get_knowns():
    """Return the subset of posted values that the loaded model knows."""
    global model

    # Payload shape: {"values": ["<string>", ...]}
    values = request.json["values"]

    return get_knowns_words(values, model)


@app.route('/correct_cell_lst', methods=['POST'])
def correct_cell_lst():
"""Auto-correct a list of string values"""
Expand All @@ -35,6 +47,7 @@ def correct_cell_lst():
corrected = get_corrected_lst(col_cells, model)
return corrected


@app.route('/correct_cell', methods=['POST'])
def correct_cell():
"""Auto-correct a list of string values"""
Expand All @@ -46,12 +59,14 @@ def correct_cell():
corrected = get_corrected(cell, model)
return corrected


@app.route('/test')
def test():
    """Smoke-test endpoint exercising the known-words lookup on sample terms.

    The stale ``return get_corrected_lst(...)`` left by the previous version
    short-circuited the function and made the new sample unreachable; it is
    kept only as a comment for reference.
    """
    global model
    # Previous smoke test exercised the correction path instead:
    # lst = ["Rashmon", "Leo?n", "Massachussetts"]
    # return get_corrected_lst(lst, model)
    lst = ['aspirine', 'Aspirin', 'aspin', 'asperin']
    return get_knowns_words(lst, model)

@app.errorhandler(InternalServerError)
def handle_500(e):
Expand Down
2 changes: 1 addition & 1 deletion services/Autocorrect/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ ftfy==5.7
gensim==3.8.3
gunicorn==20.0.4
autocorrect==2.5.0
edlib
edlib
65 changes: 50 additions & 15 deletions services/Autocorrect/solver.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,72 @@
from inc.w2v_correction import W2VCorrection
from inc.off_the_shelf_correction import correct
import config
import inc.cache

# cache of results
cache = inc.cache.Cache('terms', ['term'])


def get_knowns_words(list_values, model):
    """Filter *list_values* down to entries fully covered by the model vocabulary.

    :param list_values: list of strings; each may contain several tokens
        joined by ``config.CLEAN_CELL_SEPARATOR``
    :param model: w2v model exposing ``index2word`` (its vocabulary listing)
    :return: dict ``{'knowns': [...]}`` with the values whose every token is
        in the vocabulary
    """
    # index2word is a list, so each `in` test is O(len(vocab)); build a set
    # once and get O(1) membership for every token.
    vocab = set(model.index2word)

    knowns = []
    for value in list_values:
        tokens = value.split(config.CLEAN_CELL_SEPARATOR)
        if all(tok in vocab for tok in tokens):
            knowns.append(value)

    return {'knowns': knowns}


def get_corrected_lst(list_values, model):
"""
Core method handle all types of auto corrections
    TODO: support a cache file per auto-correct mechanism
:param list_values: list of strings to be corrected
:param model: w2v model or None if disable w2v autocorrection
:return: dict with 'auto_correct': ['fix1', 'fix2', ...]
"""
unique_vals_dict = {}
unique_vals = list(set(list_values))
# init dict?
for v in unique_vals:
unique_vals_dict.update({v: v})

# too many items, so we shorten them here.
if len(unique_vals) > config.AUTOCORRECT_MAX:
unique_vals = unique_vals[0: config.AUTOCORRECT_MAX]

# model based corrections
if config.ENABLE_MODELBASED_CORRECTIONS:
# W2V corrections
w2vCorrect = W2VCorrection(model)
[unique_vals_dict.update({w: w2vCorrect.correct_word(w)}) for w in unique_vals]
elif config.ENABLE_OFF_THE_SHELF_CORRECTIONS:
        # Off-the-shelf auto-correction
[unique_vals_dict.update({w: correct(w)}) for w in unique_vals]
elif config.ENABLE_WIKIPEDIA_CORRECTION:
# TODO: restructure inc.wikipedia_correction and use it here
pass
elif config.ENABLE_WIKIDATA_BASED_CORRECTIONS:
# TODO: restructure inc.wikidata_nearest_lbl_correction and use it here
pass
# check, if there is something in the cache
cached = cache.getMany([{'term': term} for term in unique_vals])

for r in cached['hits']:
print("Cached autocorrect hits")
unique_vals_dict.update({r['key']['term']: r['val'][0]})

missed_terms = []
# fire real requests if we have any misses
if cached['misses']:
# run lookups for all terms in parallel
missed_terms = [t['term'] for t in cached['misses']]

# Apply the actual correction on the missed terms only
if missed_terms:
# model based corrections
if config.ENABLE_MODELBASED_CORRECTIONS:
# W2V corrections
w2vCorrect = W2VCorrection(model)
[unique_vals_dict.update({w: w2vCorrect.correct_word(w)}) for w in missed_terms]
elif config.ENABLE_OFF_THE_SHELF_CORRECTIONS:
            # Off-the-shelf auto-correction
[unique_vals_dict.update({w: correct(w)}) for w in missed_terms]
elif config.ENABLE_WIKIPEDIA_CORRECTION:
# TODO: restructure inc.wikipedia_correction and use it here
pass
elif config.ENABLE_WIKIDATA_BASED_CORRECTIONS:
# TODO: restructure inc.wikidata_nearest_lbl_correction and use it here
pass

# expand to the original dimension
autocorrected_lst = [unique_vals_dict[w] for w in list_values]
Expand Down
File renamed without changes.
6 changes: 6 additions & 0 deletions services/Autocorrect/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@ def chunks(l, n):
for i in range(0, len(l), n):
# Create an index range for l of n items:
yield l[i:i + n]

def get_batch(l, n):
    """Yield successive slices of *l*, each holding at most *n* items."""
    for offset in range(0, len(l), n):
        # Slicing past the end is safe: the final batch is simply shorter.
        yield l[offset:offset + n]

This file was deleted.

Loading

0 comments on commit 94b72ba

Please sign in to comment.