diff --git a/.travis.yml b/.travis.yml index fe5fa6d..9a2c719 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,8 @@ language: python python: - - "2.7" + - 3.6 + - latest virtualenv: system_site_packages: true diff --git a/quantulum/classes.py b/quantulum/classes.py index 65cdb66..37077b4 100644 --- a/quantulum/classes.py +++ b/quantulum/classes.py @@ -1,95 +1,95 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""quantulum classes.""" - - -############################################################################### -class Quantity(object): - """Class for a quantity (e.g. "4.2 gallons").""" - - def __init__(self, value=None, unit=None, surface=None, span=None, - uncertainty=None): - """Initialization method.""" - self.value = value - self.unit = unit - self.surface = surface - self.span = span - self.uncertainty = uncertainty - - def __repr__(self): - """Representation method.""" - msg = u'Quantity(%g, "%s")' - msg = msg % (self.value, self.unit.name) - return msg.encode('utf-8') - - def __eq__(self, other): - """Equality method.""" - if isinstance(other, self.__class__): - return self.__dict__ == other.__dict__ - else: - return False - - def __ne__(self, other): - """Non equality method.""" - return not self.__eq__(other) - - -############################################################################### -class Unit(object): - """Class for a unit (e.g. 
"gallon").""" - - def __init__(self, name=None, surfaces=None, entity=None, uri=None, - symbols=None, dimensions=None): - """Initialization method.""" - self.name = name - self.surfaces = surfaces - self.entity = entity - self.uri = uri - self.symbols = symbols - self.dimensions = dimensions - - def __repr__(self): - """Representation method.""" - msg = u'Unit(name="%s", entity=Entity("%s"), uri=%s)' - msg = msg % (self.name, self.entity.name, self.uri) - return msg.encode('utf-8') - - def __eq__(self, other): - """Equality method.""" - if isinstance(other, self.__class__): - return self.__dict__ == other.__dict__ - else: - return False - - def __ne__(self, other): - """Non equality method.""" - return not self.__eq__(other) - - -############################################################################### -class Entity(object): - """Class for an entity (e.g. "volume").""" - - def __init__(self, name=None, dimensions=None, uri=None): - """Initialization method.""" - self.name = name - self.dimensions = dimensions - self.uri = uri - - def __repr__(self): - """Representation method.""" - msg = u'Entity(name="%s", uri=%s)' - msg = msg % (self.name, self.uri) - return msg.encode('utf-8') - - def __eq__(self, other): - """Equality method.""" - if isinstance(other, self.__class__): - return self.__dict__ == other.__dict__ - else: - return False - - def __ne__(self, other): - """Non equality method.""" - return not self.__eq__(other) +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""quantulum classes.""" + + +############################################################################### +class Quantity(object): + """Class for a quantity (e.g. 
"4.2 gallons").""" + + def __init__(self, value=None, unit=None, surface=None, span=None, + uncertainty=None): + """Initialization method.""" + self.value = value + self.unit = unit + self.surface = surface + self.span = span + self.uncertainty = uncertainty + + def __repr__(self): + """Representation method.""" + msg = 'Quantity(%g, "%s")' + msg = msg % (self.value, self.unit.name) + return msg + + def __eq__(self, other): + """Equality method.""" + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + else: + return False + + def __ne__(self, other): + """Non equality method.""" + return not self.__eq__(other) + + +############################################################################### +class Unit(object): + """Class for a unit (e.g. "gallon").""" + + def __init__(self, name=None, surfaces=None, entity=None, uri=None, + symbols=None, dimensions=None): + """Initialization method.""" + self.name = name + self.surfaces = surfaces + self.entity = entity + self.uri = uri + self.symbols = symbols + self.dimensions = dimensions + + def __repr__(self): + """Representation method.""" + msg = 'Unit(name="%s", entity=Entity("%s"), uri=%s)' + msg = msg % (self.name, self.entity.name, self.uri) + return msg + + def __eq__(self, other): + """Equality method.""" + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + else: + return False + + def __ne__(self, other): + """Non equality method.""" + return not self.__eq__(other) + + +############################################################################### +class Entity(object): + """Class for an entity (e.g. 
"volume").""" + + def __init__(self, name=None, dimensions=None, uri=None): + """Initialization method.""" + self.name = name + self.dimensions = dimensions + self.uri = uri + + def __repr__(self): + """Representation method.""" + msg = 'Entity(name="%s", uri=%s)' + msg = msg % (self.name, self.uri) + return msg + + def __eq__(self, other): + """Equality method.""" + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + else: + return False + + def __ne__(self, other): + """Non equality method.""" + return not self.__eq__(other) diff --git a/quantulum/classifier.py b/quantulum/classifier.py index 41f03ee..444ba20 100644 --- a/quantulum/classifier.py +++ b/quantulum/classifier.py @@ -1,167 +1,167 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""quantulum classifier functions.""" - -# Standard library -import re -import os -import json -import pickle -import logging - -# Dependencies -import wikipedia -from stemming.porter2 import stem -try: - from sklearn.linear_model import SGDClassifier - from sklearn.feature_extraction.text import TfidfVectorizer - USE_CLF = True -except ImportError: - USE_CLF = False - -# Quantulum -from . 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""quantulum classifier functions."""

# Standard library
import re
import os
import json
import pickle
import logging

# Dependencies
import wikipedia
from stemming.porter2 import stem
try:
    from sklearn.linear_model import SGDClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    USE_CLF = True
except ImportError:
    USE_CLF = False

# Quantulum
from . import load as l


###############################################################################
def download_wiki():
    """Download WikiPedia pages of ambiguous units into wiki.json."""
    ambiguous = [i for i in l.UNITS.items() if len(i[1]) > 1]
    ambiguous += [i for i in l.DERIVED_ENT.items() if len(i[1]) > 1]
    pages = set((j.name, j.uri) for i in ambiguous for j in i[1])

    print()
    objs = []
    for num, page in enumerate(pages):

        obj = {'url': page[1]}
        obj['_id'] = obj['url'].replace('https://en.wikipedia.org/wiki/', '')
        obj['clean'] = obj['_id'].replace('_', ' ')

        print('---> Downloading %s (%d of %d)' %
              (obj['clean'], num + 1, len(pages)))

        obj['text'] = wikipedia.page(obj['clean']).content
        obj['unit'] = page[0]
        objs.append(obj)

    path = os.path.join(l.TOPDIR, 'wiki.json')
    # Opening in "w" mode truncates, so the previous os.remove() was
    # redundant and crashed when the file did not exist yet.
    with open(path, 'w') as wiki_file:
        json.dump(objs, wiki_file, indent=4, sort_keys=True)

    print('\n---> All done.\n')


###############################################################################
def clean_text(text):
    """Clean text for TFIDF: strip punctuation and digit tokens, stem."""
    # The previous pattern r'\p{P}+' is a Unicode-property escape that the
    # stdlib `re` module does not support (re raises "bad escape \p" on
    # Python 3.6+); [^\w\s]+ keeps word characters and whitespace only.
    new_text = re.sub(r'[^\w\s]+', ' ', text)

    new_text = [stem(i) for i in new_text.lower().split() if not
                re.findall(r'[0-9]', i)]

    return ' '.join(new_text)


###############################################################################
def train_classifier(download=True, parameters=None, ngram_range=(1, 1)):
    """Train the intent classifier and pickle it to clf.pickle."""
    if download:
        download_wiki()

    path = os.path.join(l.TOPDIR, 'train.json')
    with open(path) as train_file:
        training_set = json.load(train_file)
    path = os.path.join(l.TOPDIR, 'wiki.json')
    with open(path) as wiki_file:
        wiki_set = json.load(wiki_file)

    target_names = list(set(i['unit'] for i in training_set + wiki_set))
    train_data, train_target = [], []
    for example in training_set + wiki_set:
        train_data.append(clean_text(example['text']))
        train_target.append(target_names.index(example['unit']))

    tfidf_model = TfidfVectorizer(sublinear_tf=True,
                                  ngram_range=ngram_range,
                                  stop_words='english')

    matrix = tfidf_model.fit_transform(train_data)

    if parameters is None:
        # NOTE(review): 'n_iter' was renamed 'max_iter' in scikit-learn
        # >= 0.19 — confirm against the pinned sklearn version.
        parameters = {'loss': 'log', 'penalty': 'l2', 'n_iter': 50,
                      'alpha': 0.00001, 'fit_intercept': True}

    clf = SGDClassifier(**parameters).fit(matrix, train_target)
    obj = {'tfidf_model': tfidf_model,
           'clf': clf,
           'target_names': target_names}
    path = os.path.join(l.TOPDIR, 'clf.pickle')
    # Pickle is a binary format: text mode ("w") raises TypeError on
    # Python 3, so the file must be opened in "wb".
    with open(path, 'wb') as clf_file:
        pickle.dump(obj, clf_file)


###############################################################################
def load_classifier():
    """Load the pickled classifier, vectorizer and target names."""
    path = os.path.join(l.TOPDIR, 'clf.pickle')
    # Binary mode ("rb") is required to unpickle on Python 3.
    with open(path, 'rb') as clf_file:
        obj = pickle.load(clf_file)

    return obj['tfidf_model'], obj['clf'], obj['target_names']

if USE_CLF:
    TFIDF_MODEL, CLF, TARGET_NAMES = load_classifier()
else:
    TFIDF_MODEL, CLF, TARGET_NAMES = None, None, None


###############################################################################
def disambiguate_entity(key, text):
    """Resolve ambiguity between entities with same dimensionality."""
    new_ent = l.DERIVED_ENT[key][0]

    if len(l.DERIVED_ENT[key]) > 1:
        transformed = TFIDF_MODEL.transform([text])
        scores = CLF.predict_proba(transformed).tolist()[0]
        scores = sorted(zip(scores, TARGET_NAMES), key=lambda x: x[0],
                        reverse=True)
        names = [i.name for i in l.DERIVED_ENT[key]]
        scores = [i for i in scores if i[1] in names]
        try:
            new_ent = l.ENTITIES[scores[0][1]]
        except IndexError:
            logging.debug('\tAmbiguity not resolved for "%s"', str(key))

    return new_ent


###############################################################################
def disambiguate_unit(unit, text):
    """
    Resolve ambiguity.

    Distinguish between units that have same names, symbols or abbreviations.
    """
    new_unit = l.UNITS[unit]
    if not new_unit:
        new_unit = l.LOWER_UNITS[unit.lower()]
    if not new_unit:
        raise KeyError('Could not find unit "%s"' % unit)

    if len(new_unit) > 1:
        transformed = TFIDF_MODEL.transform([clean_text(text)])
        scores = CLF.predict_proba(transformed).tolist()[0]
        scores = sorted(zip(scores, TARGET_NAMES), key=lambda x: x[0],
                        reverse=True)
        names = [i.name for i in new_unit]
        scores = [i for i in scores if i[1] in names]
        try:
            final = l.UNITS[scores[0][1]][0]
            logging.debug('\tAmbiguity resolved for "%s" (%s)', unit, scores)
        except IndexError:
            logging.debug('\tAmbiguity not resolved for "%s"', unit)
            final = new_unit[0]
    else:
        final = new_unit[0]

    return final
- """ - return tuple(tuple(i.items()) for i in dimensions) - - -############################################################################### -def get_dimension_permutations(entities, dimensions): - """Get all possible dimensional definitions for an entity.""" - new_dimensions = defaultdict(int) - for item in dimensions: - new = entities[item['base']].dimensions - if new: - for new_item in new: - new_dimensions[new_item['base']] += new_item['power'] * \ - item['power'] - else: - new_dimensions[item['base']] += item['power'] - - final = [[{'base': i[0], 'power': i[1]} for i in new_dimensions.items()]] - final.append(dimensions) - final = [sorted(i, key=lambda x: x['base']) for i in final] - - candidates = [] - for item in final: - if item not in candidates: - candidates.append(item) - - return candidates - - -############################################################################### -def load_entities(): - """Load entities from JSON file.""" - path = os.path.join(TOPDIR, 'entities.json') - entities = json.load(open(path)) - names = [i['name'] for i in entities] - - try: - assert len(set(names)) == len(entities) - except AssertionError: - raise Exception('Entities with same name: %s' % [i for i in names if - names.count(i) > 1]) - - entities = dict((k['name'], c.Entity(name=k['name'], - dimensions=k['dimensions'], - uri=k['URI'])) for k in entities) - - dimensions_ent = defaultdict(list) - for ent in entities: - if not entities[ent].dimensions: - continue - perms = get_dimension_permutations(entities, entities[ent].dimensions) - for perm in perms: - key = get_key_from_dimensions(perm) - dimensions_ent[key].append(entities[ent]) - - return entities, dimensions_ent - -ENTITIES, DERIVED_ENT = load_entities() - - -############################################################################### -def get_dimensions_units(names): - """Create dictionary of unit dimensions.""" - dimensions_uni = {} - - for name in names: - - key = 
get_key_from_dimensions(names[name].dimensions) - dimensions_uni[key] = names[name] - plain_dimensions = [{'base': name, 'power': 1}] - key = get_key_from_dimensions(plain_dimensions) - dimensions_uni[key] = names[name] - - if not names[name].dimensions: - names[name].dimensions = plain_dimensions - - names[name].dimensions = [{'base': names[i['base']].name, - 'power': i['power']} for i in - names[name].dimensions] - - return dimensions_uni - - -############################################################################### -def load_units(): - """Load units from JSON file.""" - names = {} - lowers = defaultdict(list) - symbols = defaultdict(list) - surfaces = defaultdict(list) - for unit in json.load(open(os.path.join(TOPDIR, 'units.json'))): - - try: - assert unit['name'] not in names - except AssertionError: - msg = 'Two units with same name in units.json: %s' % unit['name'] - raise Exception(msg) - - obj = c.Unit(name=unit['name'], surfaces=unit['surfaces'], - entity=ENTITIES[unit['entity']], uri=unit['URI'], - symbols=unit['symbols'], dimensions=unit['dimensions']) - - names[unit['name']] = obj - - for symbol in unit['symbols']: - surfaces[symbol].append(obj) - lowers[symbol.lower()].append(obj) - if unit['entity'] == 'currency': - symbols[symbol].append(obj) - - for surface in unit['surfaces']: - surfaces[surface].append(obj) - lowers[surface.lower()].append(obj) - split = surface.split() - index = None - if ' per ' in surface: - index = split.index('per') - 1 - elif 'degree ' in surface: - index = split.index('degree') - if index is not None: - plural = ' '.join([i if num != index else - PLURALS.plural(split[index]) for num, i in - enumerate(split)]) - else: - plural = PLURALS.plural(surface) - if plural != surface: - surfaces[plural].append(obj) - lowers[plural.lower()].append(obj) - - dimensions_uni = get_dimensions_units(names) - - return names, surfaces, lowers, symbols, dimensions_uni - -NAMES, UNITS, LOWER_UNITS, SYMBOLS, DERIVED_UNI = load_units() 
+#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""quantulum unit and entity loading functions.""" + +# Standard library +import os +import json +from collections import defaultdict + +# Dependencies +import inflect + +# Quantulum +from . import classes as c + +TOPDIR = os.path.dirname(__file__) or "." + +PLURALS = inflect.engine() + + +############################################################################### +def get_key_from_dimensions(dimensions): + """ + Get a key for DERIVED_UNI or DERIVED_ENT. + + Translate dimensionality into key for DERIVED_UNI and DERIVED_ENT + dictionaries. + """ + return tuple(tuple(i.items()) for i in dimensions) + + +############################################################################### +def get_dimension_permutations(entities, dimensions): + """Get all possible dimensional definitions for an entity.""" + new_dimensions = defaultdict(int) + for item in dimensions: + new = entities[item['base']].dimensions + if new: + for new_item in new: + new_dimensions[new_item['base']] += new_item['power'] * \ + item['power'] + else: + new_dimensions[item['base']] += item['power'] + + final = [[{'base': i[0], 'power': i[1]} for i in list(new_dimensions.items())]] + final.append(dimensions) + final = [sorted(i, key=lambda x: x['base']) for i in final] + + candidates = [] + for item in final: + if item not in candidates: + candidates.append(item) + + return candidates + + +############################################################################### +def load_entities(): + """Load entities from JSON file.""" + path = os.path.join(TOPDIR, 'entities.json') + entities = json.load(open(path, encoding="utf-8")) + names = [i['name'] for i in entities] + + try: + assert len(set(names)) == len(entities) + except AssertionError: + raise Exception('Entities with same name: %s' % [i for i in names if + names.count(i) > 1]) + + entities = dict((k['name'], c.Entity(name=k['name'], + dimensions=k['dimensions'], + uri=k['URI'])) for k in 
entities) + + dimensions_ent = defaultdict(list) + for ent in entities: + if not entities[ent].dimensions: + continue + perms = get_dimension_permutations(entities, entities[ent].dimensions) + for perm in perms: + key = get_key_from_dimensions(perm) + dimensions_ent[key].append(entities[ent]) + + return entities, dimensions_ent + +ENTITIES, DERIVED_ENT = load_entities() + + +############################################################################### +def get_dimensions_units(names): + """Create dictionary of unit dimensions.""" + dimensions_uni = {} + + for name in names: + + key = get_key_from_dimensions(names[name].dimensions) + dimensions_uni[key] = names[name] + plain_dimensions = [{'base': name, 'power': 1}] + key = get_key_from_dimensions(plain_dimensions) + dimensions_uni[key] = names[name] + + if not names[name].dimensions: + names[name].dimensions = plain_dimensions + + names[name].dimensions = [{'base': names[i['base']].name, + 'power': i['power']} for i in + names[name].dimensions] + + return dimensions_uni + + +############################################################################### +def load_units(): + """Load units from JSON file.""" + names = {} + lowers = defaultdict(list) + symbols = defaultdict(list) + surfaces = defaultdict(list) + for unit in json.load(open(os.path.join(TOPDIR, 'units.json'), encoding="utf-8")): + + try: + assert unit['name'] not in names + except AssertionError: + msg = 'Two units with same name in units.json: %s' % unit['name'] + raise Exception(msg) + + obj = c.Unit(name=unit['name'], surfaces=unit['surfaces'], + entity=ENTITIES[unit['entity']], uri=unit['URI'], + symbols=unit['symbols'], dimensions=unit['dimensions']) + + names[unit['name']] = obj + + for symbol in unit['symbols']: + surfaces[symbol].append(obj) + lowers[symbol.lower()].append(obj) + if unit['entity'] == 'currency': + symbols[symbol].append(obj) + + for surface in unit['surfaces']: + surfaces[surface].append(obj) + 
lowers[surface.lower()].append(obj) + split = surface.split() + index = None + if ' per ' in surface: + index = split.index('per') - 1 + elif 'degree ' in surface: + index = split.index('degree') + if index is not None: + plural = ' '.join([i if num != index else + PLURALS.plural(split[index]) for num, i in + enumerate(split)]) + else: + plural = PLURALS.plural(surface) + if plural != surface: + surfaces[plural].append(obj) + lowers[plural.lower()].append(obj) + + dimensions_uni = get_dimensions_units(names) + + return names, surfaces, lowers, symbols, dimensions_uni + +NAMES, UNITS, LOWER_UNITS, SYMBOLS, DERIVED_UNI = load_units() diff --git a/quantulum/parser.py b/quantulum/parser.py index 7da1c5e..37010e3 100644 --- a/quantulum/parser.py +++ b/quantulum/parser.py @@ -1,466 +1,459 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""quantulum parser.""" - -# Standard library -import re -import logging -from fractions import Fraction -from collections import defaultdict - -# Quantulum -from . import load as l -from . import regex as r -from . import classes as c -from . 
###############################################################################
def clean_surface(surface, span):
    """Remove spurious characters from a quantity's surface.

    Strips leading/trailing "and", spaces and hyphens, and drops a
    leading article ("one", "a", "an") when followed by a number word.
    Returns (surface, span) with the span offsets adjusted, or
    (None, None) when nothing is left.
    """
    surface = surface.replace('-', ' ')
    no_start = ['and', ' ']
    no_end = [' and', ' ']

    found = True
    while found:
        found = False
        for word in no_start:
            if surface.lower().startswith(word):
                surface = surface[len(word):]
                span = (span[0] + len(word), span[1])
                found = True
        for word in no_end:
            if surface.lower().endswith(word):
                surface = surface[:-len(word)]
                span = (span[0], span[1] - len(word))
                found = True

    if not surface:
        return None, None

    split = surface.lower().split()
    if split[0] in ['one', 'a', 'an'] and len(split) > 1 and split[1] in \
            r.UNITS + r.TENS:
        span = (span[0] + len(surface.split()[0]) + 1, span[1])
        surface = ' '.join(surface.split()[1:])

    return surface, span


###############################################################################
def extract_spellout_values(text):
    """Convert spelled out numbers in a given text to digits."""
    values = []
    for item in r.REG_TXT.finditer(text):
        surface, span = clean_surface(item.group(0), item.span())
        if not surface or surface.lower() in r.SCALES:
            continue
        curr = result = 0.0
        for word in surface.split():
            try:
                scale, increment = 1, float(word.lower())
            except ValueError:
                scale, increment = r.NUMWORDS[word.lower()]
            curr = curr * scale + increment
            # A large scale word ("thousand", "million", ...) closes the
            # current group of digits.
            if scale > 100:
                result += curr
                curr = 0.0
        # unicode() does not exist on Python 3; str() is the equivalent.
        values.append({'old_surface': surface,
                       'old_span': span,
                       'new_surface': str(result + curr)})

    # Also normalize comma-grouped digits like "1,000,000".
    for item in re.finditer(r'\d+(,\d{3})+', text):
        values.append({'old_surface': item.group(0),
                       'old_span': item.span(),
                       'new_surface': str(item.group(0).replace(',', ''))})

    return sorted(values, key=lambda x: x['old_span'][0])
###############################################################################
def substitute_values(text, values):
    """Substitute spelled-out numbers in *text* with their digit form.

    Returns the rewritten text plus a map from character position to the
    cumulative offset between new and original positions, so spans can be
    translated back to the original text.
    """
    shift, final_text, shifts = 0, text, defaultdict(int)
    for value in values:
        first = value['old_span'][0] + shift
        second = value['old_span'][1] + shift
        new_s = value['new_surface']
        final_text = final_text[0:first] + new_s + final_text[second:]
        shift += len(new_s) - len(value['old_surface'])
        for char in range(first + 1, len(final_text)):
            shifts[char] = shift

    logging.debug('Text after numeric conversion: "%s"', final_text)

    return final_text, shifts


###############################################################################
def callback(pattern):
    """Regex callback: replace a unicode fraction with its decimal value."""
    return ' %s' % (r.UNI_FRAC[pattern.group(0)])


###############################################################################
def get_values(item):
    """Extract numeric value(s) and uncertainty from a regex hit."""
    fracs = r'|'.join(r.UNI_FRAC)

    value = item.group(2)
    # ur'' literals are a SyntaxError on Python 3; plain r'' is enough.
    value = re.sub(r'(?<=\d)(%s)10' % r.MULTIPLIERS, 'e', value)
    # The fourth positional argument of re.sub is *count*, not *flags*;
    # passing re.IGNORECASE there silently capped the number of
    # substitutions instead of enabling case-insensitive matching.
    value = re.sub(fracs, callback, value, flags=re.IGNORECASE)
    value = re.sub(' +', ' ', value)

    range_separator = re.findall(r'\d+ ?(-|and|(?:- ?)?to) ?\d', value)
    uncer_separator = re.findall(r'\d+ ?(\+/-|±) ?\d', value)
    fract_separator = re.findall(r'\d+/\d+', value)

    uncertainty = None
    if range_separator:
        # "10-20" style range: represented by its two endpoints.
        values = value.split(range_separator[0])
        values = [float(re.sub(r'-$', '', i)) for i in values]
    elif uncer_separator:
        # "10 +/- 2" style: central value plus uncertainty.
        values = [float(i) for i in value.split(uncer_separator[0])]
        uncertainty = values[1]
        values = [values[0]]
    elif fract_separator:
        # "1 1/2" or "1/2" style fractions.
        values = value.split()
        if len(values) > 1:
            values = [float(values[0]) + float(Fraction(values[1]))]
        else:
            values = [float(Fraction(values[0]))]
    else:
        values = [float(re.sub(r'-$', '', value))]

    logging.debug('\tUncertainty: %s', uncertainty)
    logging.debug('\tValues: %s', values)

    return uncertainty, values
###############################################################################
def build_unit_name(dimensions):
    """Build the name of the unit from its dimensions.

    E.g. [{'base': 'metre', 'power': 1}, {'base': 'second', 'power': -2}]
    becomes "metre per square second".
    """
    name = ''

    for unit in dimensions:
        if unit['power'] < 0:
            name += 'per '
        power = abs(unit['power'])
        if power == 1:
            name += unit['base']
        elif power == 2:
            name += 'square ' + unit['base']
        elif power == 3:
            name += 'cubic ' + unit['base']
        elif power > 3:
            name += unit['base'] + ' to the %g' % power
        name += ' '

    name = name.strip()

    logging.debug('\tUnit inferred name: %s', name)

    return name


###############################################################################
def get_unit_from_dimensions(dimensions, text):
    """Reconcile a unit based on its dimensionality."""
    key = l.get_key_from_dimensions(dimensions)

    try:
        unit = l.DERIVED_UNI[key]
    except KeyError:
        logging.debug('\tCould not find unit for: %s', key)
        unit = c.Unit(name=build_unit_name(dimensions),
                      dimensions=dimensions,
                      entity=get_entity_from_dimensions(dimensions, text))

    return unit


###############################################################################
def get_entity_from_dimensions(dimensions, text):
    """
    Infer the underlying entity of a unit (e.g. "volume" for "m^3").

    Just based on the unit's dimensionality if the classifier is disabled.
    """
    new_dimensions = [{'base': l.NAMES[i['base']].entity.name,
                       'power': i['power']} for i in dimensions]

    final_dimensions = sorted(new_dimensions, key=lambda x: x['base'])
    key = l.get_key_from_dimensions(final_dimensions)

    try:
        if clf.USE_CLF:
            ent = clf.disambiguate_entity(key, text)
        else:
            ent = l.DERIVED_ENT[key][0]
    except IndexError:
        logging.debug('\tCould not find entity for: %s', key)
        ent = c.Entity(name='unknown', dimensions=new_dimensions)

    return ent


###############################################################################
def parse_unit(item, group, slash):
    """Parse surface and power from unit text."""
    surface = item.group(group).replace('.', '')
    power = re.findall(r'\-?[0-9%s]+' % r.SUPERSCRIPTS, surface)

    if power:
        # Translate superscript digits to plain digits before int().
        power = [r.UNI_SUPER[i] if i in r.UNI_SUPER else i for i
                 in power]
        power = ''.join(power)
        new_power = (-1 * int(power) if slash else int(power))
        surface = re.sub(r'\^?\-?[0-9%s]+' % r.SUPERSCRIPTS, '', surface)

    elif re.findall(r'\bcubed\b', surface):
        new_power = (-3 if slash else 3)
        surface = re.sub(r'\bcubed\b', '', surface).strip()

    elif re.findall(r'\bsquared\b', surface):
        new_power = (-2 if slash else 2)
        surface = re.sub(r'\bsquared\b', '', surface).strip()

    else:
        new_power = (-1 if slash else 1)

    return surface, new_power


###############################################################################
def get_unit(item, text):
    """Extract unit from regex hit."""
    group_units = [1, 4, 6, 8, 10]
    group_operators = [3, 5, 7, 9]

    item_units = [item.group(i) for i in group_units if item.group(i)]

    if len(item_units) == 0:
        unit = l.NAMES['dimensionless']
    else:
        dimensions, slash = [], False
        for group in sorted(group_units + group_operators):
            if not item.group(group):
                continue
            if group in group_units:
                surface, power = parse_unit(item, group, slash)
                if clf.USE_CLF:
                    base = clf.disambiguate_unit(surface, text).name
                else:
                    base = l.UNITS[surface][0].name
                dimensions += [{'base': base, 'power': power}]
            elif not slash:
                # An operator group flips subsequent powers negative.
                slash = any(i in item.group(group) for i in ['/', ' per '])

        unit = get_unit_from_dimensions(dimensions, text)

    logging.debug('\tUnit: %s', unit)
    logging.debug('\tEntity: %s', unit.entity)

    return unit


###############################################################################
def get_surface(shifts, orig_text, item, text):
    """Extract surface from regex hit, mapped back to the original text."""
    span = item.span()
    logging.debug('\tInitial span: %s ("%s")', span, text[span[0]:span[1]])

    real_span = (span[0] - shifts[span[0]], span[1] - shifts[span[1] - 1])
    surface = orig_text[real_span[0]:real_span[1]]
    logging.debug('\tShifted span: %s ("%s")', real_span, surface)

    while any(surface.endswith(i) for i in [' ', '-']):
        surface = surface[:-1]
        real_span = (real_span[0], real_span[1] - 1)

    while surface.startswith(' '):
        surface = surface[1:]
        real_span = (real_span[0] + 1, real_span[1])

    logging.debug('\tFinal span: %s ("%s")', real_span, surface)
    return surface, real_span


###############################################################################
def is_quote_artifact(orig_text, span):
    """Distinguish between quotes and units."""
    res = False
    cursor = re.finditer(r'("|\')[^ .,:;?!()*+-].*?("|\')', orig_text)

    for item in cursor:
        if item.span()[1] == span[1]:
            res = True

    return res
- if surface.lower() in ['a', 'an', 'one'] or \ - re.search(r'1st|2nd|3rd|[04-9]th', surface) or \ - re.search(r'\d+[A-Z]+\d+', surface) or \ - re.search(r'\ba second\b', surface, re.IGNORECASE): - logging.debug(u'\tMeaningless quantity ("%s"), discard', surface) - return - - # Usually "$3T" does not stand for "dollar tesla" - elif unit.entity.dimensions and \ - unit.entity.dimensions[0]['base'] == 'currency': - if len(unit.dimensions) > 1: - try: - suffix = re.findall(r'\d(K|M|B|T)\b(.*?)$', surface)[0] - values = [i * r.SUFFIXES[suffix[0]] for i in values] - unit = l.UNITS[unit.dimensions[0]['base']][0] - if suffix[1]: - surface = surface[:surface.find(suffix[1])] - span = (span[0], span[1] - len(suffix[1])) - logging.debug(u'\tCorrect for "$3T" pattern') - except IndexError: - pass - else: - try: - suffix = re.findall(r'%s(K|M|B|T)\b' % re.escape(surface), - orig_text)[0] - surface += suffix - span = (span[0], span[1] + 1) - values = [i * r.SUFFIXES[suffix] for i in values] - logging.debug(u'\tCorrect for "$3T" pattern') - except IndexError: - pass - - # Usually "1990s" stands for the decade, not the amount of seconds - elif re.match(r'[1-2]\d\d0s', surface): - unit = l.NAMES['dimensionless'] - surface = surface[:-1] - span = (span[0], span[1] - 1) - logging.debug(u'\tCorrect for decade pattern') - - # Usually "in" stands for the preposition, not inches - elif unit.dimensions[-1]['base'] == 'inch' and \ - re.search(r' in$', surface) and '/' not in surface: - if len(unit.dimensions) > 1: - unit = get_unit_from_dimensions(unit.dimensions[:-1], orig_text) - else: - unit = l.NAMES['dimensionless'] - surface = surface[:-3] - span = (span[0], span[1] - 3) - logging.debug(u'\tCorrect for "in" pattern') - - elif is_quote_artifact(text, item.span()): - if len(unit.dimensions) > 1: - unit = get_unit_from_dimensions(unit.dimensions[:-1], orig_text) - else: - unit = l.NAMES['dimensionless'] - surface = surface[:-1] - span = (span[0], span[1] - 1) - logging.debug(u'\tCorrect 
for quotes') - - elif re.search(r' time$', surface) and len(unit.dimensions) > 1 and \ - unit.dimensions[-1]['base'] == 'count': - unit = get_unit_from_dimensions(unit.dimensions[:-1], orig_text) - surface = surface[:-5] - span = (span[0], span[1] - 5) - logging.debug(u'\tCorrect for "time"') - - objs = [] - for value in values: - obj = c.Quantity(value=value, - unit=unit, - surface=surface, - span=span, - uncertainty=uncert) - objs.append(obj) - - return objs - - -############################################################################### -def clean_text(text): - """Clean text before parsing.""" - # Replace a few nasty unicode characters with their ASCII equivalent - maps = {u'×': u'x', u'–': u'-', u'−': '-'} - for element in maps: - text = text.replace(element, maps[element]) - - # Replace genitives - text = re.sub(r'(?<=\w)\'s\b|(?<=\w)s\'(?!\w)', ' ', text) - - logging.debug(u'Clean text: "%s"', text) - - return text - - -############################################################################### -def parse(text, verbose=False): - """Extract all quantities from unstructured text.""" - log_format = ('%(asctime)s --- %(message)s') - logging.basicConfig(format=log_format) - root = logging.getLogger() - - if verbose: - level = root.level - root.setLevel(logging.DEBUG) - logging.debug(u'Verbose mode') - - if isinstance(text, str): - text = text.decode('utf-8') - logging.debug(u'Converted string to unicode (assume utf-8 encoding)') - - orig_text = text - logging.debug(u'Original text: "%s"', orig_text) - - text = clean_text(text) - values = extract_spellout_values(text) - text, shifts = substitute_values(text, values) - - quantities = [] - for item in r.REG_DIM.finditer(text): - - groups = dict([i for i in item.groupdict().items() if i[1] and - i[1].strip()]) - logging.debug(u'Quantity found: %s', groups) - - try: - uncert, values = get_values(item) - except ValueError as err: - logging.debug(u'Could not parse quantity: %s', err) - - unit = get_unit(item, 
text) - surface, span = get_surface(shifts, orig_text, item, text) - objs = build_quantity(orig_text, text, item, values, unit, surface, - span, uncert) - if objs is not None: - quantities += objs - - if verbose: - root.level = level - - return quantities - - -############################################################################### -def inline_parse(text, verbose=False): - """Extract all quantities from unstructured text.""" - if isinstance(text, str): - text = text.decode('utf-8') - - parsed = parse(text, verbose=verbose) - - shift = 0 - for quantity in parsed: - index = quantity.span[1] + shift - to_add = u' {' + unicode(quantity) + u'}' - text = text[0:index] + to_add + text[index:] - shift += len(to_add) - - return text +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""quantulum parser.""" + +# Standard library +import re +import logging +from fractions import Fraction +from collections import defaultdict + +# Quantulum +from . import load as l +from . import regex as r +from . import classes as c +from . 
import classifier as clf + + +############################################################################### +def clean_surface(surface, span): + """Remove spurious characters from a quantity's surface.""" + surface = surface.replace('-', ' ') + no_start = ['and', ' '] + no_end = [' and', ' '] + + found = True + while found: + found = False + for word in no_start: + if surface.lower().startswith(word): + surface = surface[len(word):] + span = (span[0] + len(word), span[1]) + found = True + for word in no_end: + if surface.lower().endswith(word): + surface = surface[:-len(word)] + span = (span[0], span[1] - len(word)) + found = True + + if not surface: + return None, None + + split = surface.lower().split() + if split[0] in ['one', 'a', 'an'] and len(split) > 1 and split[1] in \ + r.UNITS + r.TENS: + span = (span[0] + len(surface.split()[0]) + 1, span[1]) + surface = ' '.join(surface.split()[1:]) + + return surface, span + + +############################################################################### +def extract_spellout_values(text): + """Convert spelled out numbers in a given text to digits.""" + values = [] + for item in r.REG_TXT.finditer(text): + surface, span = clean_surface(item.group(0), item.span()) + if not surface or surface.lower() in r.SCALES: + continue + curr = result = 0.0 + for word in surface.split(): + try: + scale, increment = 1, float(word.lower()) + except ValueError: + scale, increment = r.NUMWORDS[word.lower()] + curr = curr * scale + increment + if scale > 100: + result += curr + curr = 0.0 + values.append({'old_surface': surface, + 'old_span': span, + 'new_surface': str(result + curr)}) + + for item in re.finditer(r'\d+(,\d{3})+', text): + values.append({'old_surface': item.group(0), + 'old_span': item.span(), + 'new_surface': str(item.group(0).replace(',', ''))}) + + return sorted(values, key=lambda x: x['old_span'][0]) + + +############################################################################### +def substitute_values(text, 
+    value = re.sub(fracs, callback, value, flags=re.IGNORECASE)
+def build_unit_name(dimensions): + """Build the name of the unit from its dimensions.""" + name = '' + + for unit in dimensions: + if unit['power'] < 0: + name += 'per ' + power = abs(unit['power']) + if power == 1: + name += unit['base'] + elif power == 2: + name += 'square ' + unit['base'] + elif power == 3: + name += 'cubic ' + unit['base'] + elif power > 3: + name += unit['base'] + ' to the %g' % power + name += ' ' + + name = name.strip() + + logging.debug('\tUnit inferred name: %s', name) + + return name + + +############################################################################### +def get_unit_from_dimensions(dimensions, text): + """Reconcile a unit based on its dimensionality.""" + key = l.get_key_from_dimensions(dimensions) + + try: + unit = l.DERIVED_UNI[key] + except KeyError: + logging.debug('\tCould not find unit for: %s', key) + unit = c.Unit(name=build_unit_name(dimensions), + dimensions=dimensions, + entity=get_entity_from_dimensions(dimensions, text)) + + return unit + + +############################################################################### +def get_entity_from_dimensions(dimensions, text): + """ + Infer the underlying entity of a unit (e.g. "volume" for "m^3"). + + Just based on the unit's dimensionality if the classifier is disabled. 
+ """ + new_dimensions = [{'base': l.NAMES[i['base']].entity.name, + 'power': i['power']} for i in dimensions] + + final_dimensions = sorted(new_dimensions, key=lambda x: x['base']) + key = l.get_key_from_dimensions(final_dimensions) + + try: + if clf.USE_CLF: + ent = clf.disambiguate_entity(key, text) + else: + ent = l.DERIVED_ENT[key][0] + except IndexError: + logging.debug('\tCould not find entity for: %s', key) + ent = c.Entity(name='unknown', dimensions=new_dimensions) + + return ent + + +############################################################################### +def parse_unit(item, group, slash): + """Parse surface and power from unit text.""" + surface = item.group(group).replace('.', '') + power = re.findall(r'\-?[0-9%s]+' % r.SUPERSCRIPTS, surface) + + if power: + power = [r.UNI_SUPER[i] if i in r.UNI_SUPER else i for i + in power] + power = ''.join(power) + new_power = (-1 * int(power) if slash else int(power)) + surface = re.sub(r'\^?\-?[0-9%s]+' % r.SUPERSCRIPTS, '', surface) + + elif re.findall(r'\bcubed\b', surface): + new_power = (-3 if slash else 3) + surface = re.sub(r'\bcubed\b', '', surface).strip() + + elif re.findall(r'\bsquared\b', surface): + new_power = (-2 if slash else 2) + surface = re.sub(r'\bsquared\b', '', surface).strip() + + else: + new_power = (-1 if slash else 1) + + return surface, new_power + + +############################################################################### +def get_unit(item, text): + """Extract unit from regex hit.""" + group_units = [1, 4, 6, 8, 10] + group_operators = [3, 5, 7, 9] + + item_units = [item.group(i) for i in group_units if item.group(i)] + + if len(item_units) == 0: + unit = l.NAMES['dimensionless'] + else: + dimensions, slash = [], False + for group in sorted(group_units + group_operators): + if not item.group(group): + continue + if group in group_units: + surface, power = parse_unit(item, group, slash) + if clf.USE_CLF: + base = clf.disambiguate_unit(surface, text).name + else: + base = 
l.UNITS[surface][0].name + dimensions += [{'base': base, 'power': power}] + elif not slash: + slash = any(i in item.group(group) for i in ['/', ' per ']) + + unit = get_unit_from_dimensions(dimensions, text) + + logging.debug('\tUnit: %s', unit) + logging.debug('\tEntity: %s', unit.entity) + + return unit + + +############################################################################### +def get_surface(shifts, orig_text, item, text): + """Extract surface from regex hit.""" + span = item.span() + logging.debug('\tInitial span: %s ("%s")', span, text[span[0]:span[1]]) + + real_span = (span[0] - shifts[span[0]], span[1] - shifts[span[1] - 1]) + surface = orig_text[real_span[0]:real_span[1]] + logging.debug('\tShifted span: %s ("%s")', real_span, surface) + + while any(surface.endswith(i) for i in [' ', '-']): + surface = surface[:-1] + real_span = (real_span[0], real_span[1] - 1) + + while surface.startswith(' '): + surface = surface[1:] + real_span = (real_span[0] + 1, real_span[1]) + + logging.debug('\tFinal span: %s ("%s")', real_span, surface) + return surface, real_span + + +############################################################################### +def is_quote_artifact(orig_text, span): + """Distinguish between quotes and units.""" + res = False + cursor = re.finditer(r'("|\')[^ .,:;?!()*+-].*?("|\')', orig_text) + + for item in cursor: + if item.span()[1] == span[1]: + res = True + + return res + + +############################################################################### +def build_quantity(orig_text, text, item, values, unit, surface, span, uncert): + """Build a Quantity object out of extracted information.""" + # Discard irrelevant txt2float extractions, cardinal numbers, codes etc. 
+ if surface.lower() in ['a', 'an', 'one'] or \ + re.search(r'1st|2nd|3rd|[04-9]th', surface) or \ + re.search(r'\d+[A-Z]+\d+', surface) or \ + re.search(r'\ba second\b', surface, re.IGNORECASE): + logging.debug('\tMeaningless quantity ("%s"), discard', surface) + return + + # Usually "$3T" does not stand for "dollar tesla" + elif unit.entity.dimensions and \ + unit.entity.dimensions[0]['base'] == 'currency': + if len(unit.dimensions) > 1: + try: + suffix = re.findall(r'\d(K|M|B|T)\b(.*?)$', surface)[0] + values = [i * r.SUFFIXES[suffix[0]] for i in values] + unit = l.UNITS[unit.dimensions[0]['base']][0] + if suffix[1]: + surface = surface[:surface.find(suffix[1])] + span = (span[0], span[1] - len(suffix[1])) + logging.debug('\tCorrect for "$3T" pattern') + except IndexError: + pass + else: + try: + suffix = re.findall(r'%s(K|M|B|T)\b' % re.escape(surface), + orig_text)[0] + surface += suffix + span = (span[0], span[1] + 1) + values = [i * r.SUFFIXES[suffix] for i in values] + logging.debug('\tCorrect for "$3T" pattern') + except IndexError: + pass + + # Usually "1990s" stands for the decade, not the amount of seconds + elif re.match(r'[1-2]\d\d0s', surface): + unit = l.NAMES['dimensionless'] + surface = surface[:-1] + span = (span[0], span[1] - 1) + logging.debug('\tCorrect for decade pattern') + + # Usually "in" stands for the preposition, not inches + elif unit.dimensions[-1]['base'] == 'inch' and \ + re.search(r' in$', surface) and '/' not in surface: + if len(unit.dimensions) > 1: + unit = get_unit_from_dimensions(unit.dimensions[:-1], orig_text) + else: + unit = l.NAMES['dimensionless'] + surface = surface[:-3] + span = (span[0], span[1] - 3) + logging.debug('\tCorrect for "in" pattern') + + elif is_quote_artifact(text, item.span()): + if len(unit.dimensions) > 1: + unit = get_unit_from_dimensions(unit.dimensions[:-1], orig_text) + else: + unit = l.NAMES['dimensionless'] + surface = surface[:-1] + span = (span[0], span[1] - 1) + logging.debug('\tCorrect for 
quotes') + + elif re.search(r' time$', surface) and len(unit.dimensions) > 1 and \ + unit.dimensions[-1]['base'] == 'count': + unit = get_unit_from_dimensions(unit.dimensions[:-1], orig_text) + surface = surface[:-5] + span = (span[0], span[1] - 5) + logging.debug('\tCorrect for "time"') + + objs = [] + for value in values: + obj = c.Quantity(value=value, + unit=unit, + surface=surface, + span=span, + uncertainty=uncert) + objs.append(obj) + + return objs + + +############################################################################### +def clean_text(text): + """Clean text before parsing.""" + # Replace a few nasty unicode characters with their ASCII equivalent + maps = {'×': 'x', '–': '-', '−': '-'} + for element in maps: + text = text.replace(element, maps[element]) + + # Replace genitives + text = re.sub(r'(?<=\w)\'s\b|(?<=\w)s\'(?!\w)', ' ', text) + + logging.debug('Clean text: "%s"', text) + + return text + + +############################################################################### +def parse(text, verbose=False): + """Extract all quantities from unstructured text.""" + log_format = ('%(asctime)s --- %(message)s') + logging.basicConfig(format=log_format) + root = logging.getLogger() + + if verbose: + level = root.level + root.setLevel(logging.DEBUG) + logging.debug('Verbose mode') + + orig_text = text + logging.debug('Original text: "%s"', orig_text) + + text = clean_text(text) + values = extract_spellout_values(text) + text, shifts = substitute_values(text, values) + + quantities = [] + for item in r.REG_DIM.finditer(text): + + groups = dict([i for i in list(item.groupdict().items()) if i[1] and + i[1].strip()]) + logging.debug('Quantity found: %s', groups) + + try: + uncert, values = get_values(item) + except ValueError as err: + logging.debug('Could not parse quantity: %s', err) + + unit = get_unit(item, text) + surface, span = get_surface(shifts, orig_text, item, text) + objs = build_quantity(orig_text, text, item, values, unit, surface, + span, 
uncert) + if objs is not None: + quantities += objs + + if verbose: + root.level = level + + return quantities + + +############################################################################### +def inline_parse(text, verbose=False): + """Extract all quantities from unstructured text.""" + parsed = parse(text, verbose=verbose) + + shift = 0 + for quantity in parsed: + index = quantity.span[1] + shift + to_add = ' {' + str(quantity) + '}' + text = text[0:index] + to_add + text[index:] + shift += len(to_add) + + return text diff --git a/quantulum/regex.py b/quantulum/regex.py index 9ddf095..77154ab 100644 --- a/quantulum/regex.py +++ b/quantulum/regex.py @@ -1,134 +1,134 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""quantulum regex functions.""" - -# Standard library -import re - -# Quantulum -from . import load as l - -UNITS = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', - 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', - 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'] - -TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', - 'eighty', 'ninety'] - -SCALES = ['hundred', 'thousand', 'million', 'billion', 'trillion'] - - -############################################################################### -def get_numwords(): - """Convert number words to integers in a given text.""" - numwords = {'and': (1, 0), 'a': (1, 1), 'an': (1, 1)} - - for idx, word in enumerate(UNITS): - numwords[word] = (1, idx) - for idx, word in enumerate(TENS): - numwords[word] = (1, idx * 10) - for idx, word in enumerate(SCALES): - numwords[word] = (10 ** (idx * 3 or 2), 0) - - all_numbers = ur'|'.join(ur'\b%s\b' % i for i in numwords.keys() if i) - - return all_numbers, numwords - - -############################################################################### - -SUFFIXES = {'K': 1e3, 'M': 1e6, 'B': 1e9, 'T': 1e12} - -UNI_SUPER = {u'¹': '1', u'²': '2', u'³': '3', u'⁴': '4', u'⁵': '5', - u'⁶': '6', u'⁷': '7', 
u'⁸': '8', u'⁹': '9', u'⁰': '0'} - -UNI_FRAC = {u'¼': '1/4', u'½': '1/2', u'¾': '3/4', u'⅐': '1/7', u'⅑': '1/9', - u'⅒': '1/10', u'⅓': '1/3', u'⅔': '2/3', u'⅕': '1/5', u'⅖': '2/5', - u'⅗': '3/5', u'⅘': '4/5', u'⅙': '1/6', u'⅚': '5/6', u'⅛': '1/8', - u'⅜': '3/8', u'⅝': '5/8', u'⅞': '7/8'} - -OPERATORS = {u'/': u' per ', u' per ': u' per ', u' a ': ' per ', - u'*': u' ', u' ': u' ', u'·': u' ', u'x': u' '} - -ALL_NUM, NUMWORDS = get_numwords() -FRACTIONS = re.escape(''.join(UNI_FRAC.keys())) -SUPERSCRIPTS = re.escape(''.join(UNI_SUPER.keys())) - -MULTIPLIERS = r'|'.join(ur'%s' % re.escape(i) for i in OPERATORS if - OPERATORS[i] == ' ') - -NUM_PATTERN = ur''' # Pattern for extracting a digit-based number - - (?: # required number - [+-]? # optional sign - \.?\d+ # required digits - (?:\.\d+)? # optional decimals - ) - (?: # optional exponent - (?:%s)? # multiplicative operators - (?:E|e|10\^?) # required exponent prefix - (?:[+-]?\d+|[%s]) # required exponent, superscript or normal - )? - (?: # optional fraction - \ \d+/\d+|\ ?[%s]|/\d+ - )? - -''' % (MULTIPLIERS, SUPERSCRIPTS, FRACTIONS) - -RAN_PATTERN = ur''' # Pattern for a range of numbers - - (?: # First number - (?(?:%s)(?![a-zA-Z]))? # Currencies, mainly - (?P%s)-? # Number - (?:(?P%s)?(?P(?:%s)%s)?) # Operator + Unit (1) - (?:(?P%s)?(?P(?:%s)%s)?) # Operator + Unit (2) - (?:(?P%s)?(?P(?:%s)%s)?) # Operator + Unit (3) - (?:(?P%s)?(?P(?:%s)%s)?) # Operator + Unit (4) - - ''' % tuple([all_symbols, RAN_PATTERN] + 4 * [all_ops, all_units, - exponent]) - - regex = re.compile(pattern, re.VERBOSE | re.IGNORECASE) - - return regex - -REG_DIM = get_units_regex() +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""quantulum regex functions.""" + +# Standard library +import re + +# Quantulum +from . 
import load as l + +UNITS = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', + 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', + 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'] + +TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', + 'eighty', 'ninety'] + +SCALES = ['hundred', 'thousand', 'million', 'billion', 'trillion'] + + +############################################################################### +def get_numwords(): + """Convert number words to integers in a given text.""" + numwords = {'and': (1, 0), 'a': (1, 1), 'an': (1, 1)} + + for idx, word in enumerate(UNITS): + numwords[word] = (1, idx) + for idx, word in enumerate(TENS): + numwords[word] = (1, idx * 10) + for idx, word in enumerate(SCALES): + numwords[word] = (10 ** (idx * 3 or 2), 0) + + all_numbers = r'|'.join(r'\b%s\b' % i for i in list(numwords.keys()) if i) + + return all_numbers, numwords + + +############################################################################### + +SUFFIXES = {'K': 1e3, 'M': 1e6, 'B': 1e9, 'T': 1e12} + +UNI_SUPER = {'¹': '1', '²': '2', '³': '3', '⁴': '4', '⁵': '5', + '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9', '⁰': '0'} + +UNI_FRAC = {'¼': '1/4', '½': '1/2', '¾': '3/4', '⅐': '1/7', '⅑': '1/9', + '⅒': '1/10', '⅓': '1/3', '⅔': '2/3', '⅕': '1/5', '⅖': '2/5', + '⅗': '3/5', '⅘': '4/5', '⅙': '1/6', '⅚': '5/6', '⅛': '1/8', + '⅜': '3/8', '⅝': '5/8', '⅞': '7/8'} + +OPERATORS = {'/': ' per ', ' per ': ' per ', ' a ': ' per ', + '*': ' ', ' ': ' ', '·': ' ', 'x': ' '} + +ALL_NUM, NUMWORDS = get_numwords() +FRACTIONS = re.escape(''.join(list(UNI_FRAC.keys()))) +SUPERSCRIPTS = re.escape(''.join(list(UNI_SUPER.keys()))) + +MULTIPLIERS = r'|'.join(r'%s' % re.escape(i) for i in OPERATORS if + OPERATORS[i] == ' ') + +NUM_PATTERN = r''' # Pattern for extracting a digit-based number + + (?: # required number + [+-]? # optional sign + \.?\d+ # required digits + (?:\.\d+)? 
# optional decimals + ) + (?: # optional exponent + (?:%s)? # multiplicative operators + (?:E|e|10\^?) # required exponent prefix + (?:[+-]?\d+|[%s]) # required exponent, superscript or normal + )? + (?: # optional fraction + \ \d+/\d+|\ ?[%s]|/\d+ + )? + +''' % (MULTIPLIERS, SUPERSCRIPTS, FRACTIONS) + +RAN_PATTERN = r''' # Pattern for a range of numbers + + (?: # First number + (?(?:%s)(?![a-zA-Z]))? # Currencies, mainly + (?P%s)-? # Number + (?:(?P%s)?(?P(?:%s)%s)?) # Operator + Unit (1) + (?:(?P%s)?(?P(?:%s)%s)?) # Operator + Unit (2) + (?:(?P%s)?(?P(?:%s)%s)?) # Operator + Unit (3) + (?:(?P%s)?(?P(?:%s)%s)?) # Operator + Unit (4) + + ''' % tuple([all_symbols, RAN_PATTERN] + 4 * [all_ops, all_units, + exponent]) + + regex = re.compile(pattern, re.VERBOSE | re.IGNORECASE) + + return regex + +REG_DIM = get_units_regex() diff --git a/quantulum/tests.py b/quantulum/tests.py index fcb8360..93bb910 100644 --- a/quantulum/tests.py +++ b/quantulum/tests.py @@ -1,156 +1,156 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""quantulum tests.""" - -# Standard library -import os -import re -import json -import unittest - -# Dependencies -import wikipedia - -# Quantulum -from . import load as l -from . import parser as p -from . import classes as c - -COLOR1 = '\033[94m%s\033[0m' -COLOR2 = '\033[91m%s\033[0m' -TOPDIR = os.path.dirname(__file__) or "." 
- - -############################################################################### -def embed_text(quants, beg_char, chunk, content): - """Embed quantities in text.""" - if quants: - end_char = max((chunk + 1) * 1000, quants[-1].span[1]) - text = content[beg_char:end_char] - shift = 0 - for quantity in quants: - index = quantity.span[1] - beg_char + shift - to_add = COLOR1 % (' {' + str(quantity) + '}') - text = text[0:index] + to_add + COLOR2 % text[index:] - shift += len(to_add) + len(COLOR2) - 6 - else: - end_char = (chunk + 1) * 1000 - text = content[beg_char:end_char] - - return text, end_char - - -############################################################################### -def wiki_test(page='CERN'): - """Download a wikipedia page and test the parser on its content. - - Pages full of units: - CERN - Hubble_Space_Telescope, - Herschel_Space_Observatory - """ - content = wikipedia.page(page).content - parsed = p.parse(content) - parts = int(round(len(content) * 1.0 / 1000)) - - print - end_char = 0 - for num, chunk in enumerate(range(parts)): - _ = os.system('clear') - print - quants = [j for j in parsed if chunk * 1000 < j.span[0] < (chunk + 1) * - 1000] - beg_char = max(chunk * 1000, end_char) - text, end_char = embed_text(quants, beg_char, chunk, content) - print COLOR2 % text - print - try: - _ = raw_input('--------- End part %d of %d\n' % (num + 1, parts)) - except (KeyboardInterrupt, EOFError): - return - - -############################################################################### -def get_quantity(test, item): - """Build a single quantity for the test.""" - try: - unit = l.NAMES[item['unit']] - except KeyError: - try: - entity = item['entity'] - except KeyError: - print ('Could not find %s, provide "dimensions" and' - ' "entity"' % item['unit']) - return - if entity == 'unknown': - dimensions = [{'base': l.NAMES[i['base']].entity.name, - 'power': i['power']} for i in - item['dimensions']] - entity = c.Entity(name='unknown', 
dimensions=dimensions) - elif entity in l.ENTITIES: - entity = l.ENTITIES[entity] - else: - print ('Could not find %s, provide "dimensions" and' - ' "entity"' % item['unit']) - return - unit = c.Unit(name=item['unit'], - dimensions=item['dimensions'], - entity=entity) - try: - span = re.finditer(re.escape(item['surface']), - test['req']).next().span() - except StopIteration: - print 'Surface mismatch for "%s"' % test['req'] - return - - uncert = None - if 'uncertainty' in item: - uncert = item['uncertainty'] - - quantity = c.Quantity(value=item['value'], - unit=unit, - surface=item['surface'], - span=span, - uncertainty=uncert) - - return quantity - - -############################################################################### -def load_tests(): - """Load all tests from tests.json.""" - path = os.path.join(TOPDIR, 'tests.json') - tests = json.load(open(path)) - - for test in tests: - res = [] - for item in test['res']: - quantity = get_quantity(test, item) - if quantity is None: - return - res.append(quantity) - test['res'] = [i for i in res] - - return tests - - -############################################################################### -class EndToEndTests(unittest.TestCase): - """Test suite for the quantulum project.""" - - def test_load_tests(self): - """Test for tests.load_test() function.""" - self.assertFalse(load_tests() is None) - - def test_parse(self): - """Test for parser.parse() function.""" - all_tests = load_tests() - for test in sorted(all_tests, key=lambda x: len(x['req'])): - self.assertEqual(p.parse(test['req']), test['res']) - - -############################################################################### -if __name__ == '__main__': - - unittest.main() +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""quantulum tests.""" + +# Standard library +import os +import re +import json +import unittest + +# Dependencies +import wikipedia + +# Quantulum +from . import load as l +from . import parser as p +from . 
import classes as c + +COLOR1 = '\033[94m%s\033[0m' +COLOR2 = '\033[91m%s\033[0m' +TOPDIR = os.path.dirname(__file__) or "." + + +############################################################################### +def embed_text(quants, beg_char, chunk, content): + """Embed quantities in text.""" + if quants: + end_char = max((chunk + 1) * 1000, quants[-1].span[1]) + text = content[beg_char:end_char] + shift = 0 + for quantity in quants: + index = quantity.span[1] - beg_char + shift + to_add = COLOR1 % (' {' + str(quantity) + '}') + text = text[0:index] + to_add + COLOR2 % text[index:] + shift += len(to_add) + len(COLOR2) - 6 + else: + end_char = (chunk + 1) * 1000 + text = content[beg_char:end_char] + + return text, end_char + + +############################################################################### +def wiki_test(page='CERN'): + """Download a wikipedia page and test the parser on its content. + + Pages full of units: + CERN + Hubble_Space_Telescope, + Herschel_Space_Observatory + """ + content = wikipedia.page(page).content + parsed = p.parse(content) + parts = int(round(len(content) * 1.0 / 1000)) + + print() + end_char = 0 + for num, chunk in enumerate(range(parts)): + _ = os.system('clear') + print() + quants = [j for j in parsed if chunk * 1000 < j.span[0] < (chunk + 1) * + 1000] + beg_char = max(chunk * 1000, end_char) + text, end_char = embed_text(quants, beg_char, chunk, content) + print(COLOR2 % text) + print() + try: + _ = input('--------- End part %d of %d\n' % (num + 1, parts)) + except (KeyboardInterrupt, EOFError): + return + + +############################################################################### +def get_quantity(test, item): + """Build a single quantity for the test.""" + try: + unit = l.NAMES[item['unit']] + except KeyError: + try: + entity = item['entity'] + except KeyError: + print(('Could not find %s, provide "dimensions" and' + ' "entity"' % item['unit'])) + return + if entity == 'unknown': + dimensions = [{'base': 
l.NAMES[i['base']].entity.name,
+                           'power': i['power']} for i in
+                          item['dimensions']]
+            entity = c.Entity(name='unknown', dimensions=dimensions)
+        elif entity in l.ENTITIES:
+            entity = l.ENTITIES[entity]
+        else:
+            print(('Could not find %s, provide "dimensions" and'
+                   ' "entity"' % item['unit']))
+            return
+        unit = c.Unit(name=item['unit'],
+                      dimensions=item['dimensions'],
+                      entity=entity)
+    try:
+        span = next(re.finditer(re.escape(item['surface']),
+                                test['req'])).span()
+    except StopIteration:
+        print('Surface mismatch for "%s"' % test['req'])
+        return
+
+    uncert = None
+    if 'uncertainty' in item:
+        uncert = item['uncertainty']
+
+    quantity = c.Quantity(value=item['value'],
+                          unit=unit,
+                          surface=item['surface'],
+                          span=span,
+                          uncertainty=uncert)
+
+    return quantity
+
+
+###############################################################################
+def load_tests():
+    """Load all tests from tests.json."""
+    path = os.path.join(TOPDIR, 'tests.json')
+    tests = json.load(open(path))
+
+    for test in tests:
+        res = []
+        for item in test['res']:
+            quantity = get_quantity(test, item)
+            if quantity is None:
+                return
+            res.append(quantity)
+        test['res'] = [i for i in res]
+
+    return tests
+
+
+###############################################################################
+class EndToEndTests(unittest.TestCase):
+    """Test suite for the quantulum project."""
+
+    def test_load_tests(self):
+        """Test for tests.load_test() function."""
+        self.assertFalse(load_tests() is None)
+
+    def test_parse(self):
+        """Test for parser.parse() function."""
+        all_tests = load_tests()
+        for test in sorted(all_tests, key=lambda x: len(x['req'])):
+            self.assertEqual(p.parse(test['req']), test['res'])
+
+
+###############################################################################
+if __name__ == '__main__':
+
+    unittest.main()
diff --git a/setup.py b/setup.py
index 06b5458..90e1cd8 100644
--- a/setup.py
+++ b/setup.py
@@ -1,45 +1,45 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
- 
-"""quantulum setup file.""" - -import sys - -try: - from setuptools import setup -except ImportError: - print 'Please install or upgrade setuptools or pip to continue' - sys.exit(1) - -import quantulum - -setup( - name='quantulum', - packages=['quantulum'], - package_data={'quantulum': ['clf.pickle', 'units.json', 'entities.json', - 'tests.json', 'train.json', 'wiki.json']}, - description='Extract quantities from unstructured text.', - long_description=open('README.rst').read(), - download_url='https://github.com/marcolagi/quantulum/tarball/0.1', - version=quantulum.__version__, - url=quantulum.__url__, - author=quantulum.__author__, - author_email=quantulum.__author_email__, - license=quantulum.__license__, - test_suite='quantulum.tests.EndToEndTests', - keywords=['information extraction', 'quantities', 'units', 'measurements', - 'nlp', 'natural language processing', 'text mining', - 'text processing'], - install_requires=['inflect', 'stemming', 'wikipedia'], - classifiers=['Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'Operating System :: MacOS :: MacOS X', - 'Operating System :: Microsoft :: Windows', - 'Operating System :: POSIX', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Development Status :: 3 - Alpha', - 'Natural Language :: English', - 'Topic :: Text Processing :: Linguistic', - 'Topic :: Scientific/Engineering']) +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +"""quantulum setup file.""" + +import sys + +try: + from setuptools import setup +except ImportError: + print('Please install or upgrade setuptools or pip to continue') + sys.exit(1) + +import quantulum + +setup( + name='quantulum', + packages=['quantulum'], + package_data={'quantulum': ['clf.pickle', 'units.json', 'entities.json', + 'tests.json', 'train.json', 'wiki.json']}, + description='Extract quantities from unstructured text.', + long_description=open('README.rst').read(), 
+    download_url='https://github.com/marcolagi/quantulum/tarball/0.1',
+    version=quantulum.__version__,
+    url=quantulum.__url__,
+    author=quantulum.__author__,
+    author_email=quantulum.__author_email__,
+    license=quantulum.__license__,
+    test_suite='quantulum.tests.EndToEndTests',
+    keywords=['information extraction', 'quantities', 'units', 'measurements',
+              'nlp', 'natural language processing', 'text mining',
+              'text processing'],
+    install_requires=['inflect', 'stemming', 'wikipedia'],
+    classifiers=['Intended Audience :: Developers',
+                 'Intended Audience :: Science/Research',
+                 'Operating System :: MacOS :: MacOS X',
+                 'Operating System :: Microsoft :: Windows',
+                 'Operating System :: POSIX',
+                 'License :: OSI Approved :: MIT License',
+                 'Programming Language :: Python',
+                 'Programming Language :: Python :: 3.6',
+                 'Development Status :: 3 - Alpha',
+                 'Natural Language :: English',
+                 'Topic :: Text Processing :: Linguistic',
+                 'Topic :: Scientific/Engineering'])