diff --git a/.gitignore b/.gitignore
index 9251e52..30ef2cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,9 @@ var/
 pip-log.txt
 pip-delete-this-directory.txt
 
+# Performance test data
+tests/data/performance_data/
+
 # Unit test / coverage reports
 htmlcov/
 .tox/
diff --git a/tests/Pruebas de Performance Textar.ipynb b/tests/Pruebas de Performance Textar.ipynb
new file mode 100644
index 0000000..d0f9cdd
--- /dev/null
+++ b/tests/Pruebas de Performance Textar.ipynb
@@ -0,0 +1,193 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext memory_profiler\n",
+    "from textar import TextClassifier\n",
+    "import xml.etree.ElementTree as ET\n",
+    "from lxml import etree\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper functions\n",
+    "\n",
+    "def parse_blog(tree, min_words=100):\n",
+    "    dates = []\n",
+    "    posts = []\n",
+    "    date = None\n",
+    "    for elem in tree:\n",
+    "        post = None\n",
+    "        if elem.tag == 'date':\n",
+    "            date = elem.text\n",
+    "        elif elem.tag == 'post':\n",
+    "            post = elem.text\n",
+    "        if post is not None:\n",
+    "            words = re.findall(r'\\w+\\W', post)\n",
+    "            # keep long posts whose words have a reasonable mean length\n",
+    "            if len(words) > min_words and np.mean([len(w) for w in words]) > 2:\n",
+    "                dates.append(date)\n",
+    "                posts.append(post)\n",
+    "    return dates, posts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Config\n",
+    "DATA_FOLDER = os.path.join('.', 'data', 'performance_data', 'blogs')\n",
+    "MAX_FILES = 10000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# DOCTYPE preamble declaring HTML entities (e.g. &nbsp;) used in the blog files\n",
+    "magic = '''<!DOCTYPE Blog [\n",
+    "    <!ENTITY nbsp \" \">\n",
+    "]>'''\n",
+    "\n",
+    "parser = etree.XMLParser(recover=True)\n",
+    "\n",
+    "all_dates = []\n",
+    "all_posts = []\n",
+    "all_genders = []\n",
+    "all_ages = []\n",
+    "all_categories = []\n",
+    "\n",
+    "for file_name in os.listdir(DATA_FOLDER)[:MAX_FILES]:\n",
+    "    id_f, gender, age, category, zodiac, ext = file_name.split('.')\n",
+    "    with open(os.path.join(DATA_FOLDER, file_name), 'r') as f:\n",
+    "        try:\n",
+    "            tree = ET.fromstring(magic + f.read(), parser=parser)\n",
+    "            dates, posts = parse_blog(tree)\n",
+    "            all_posts += posts\n",
+    "            all_dates += dates\n",
+    "            all_genders += [gender] * len(dates)\n",
+    "            all_ages += [age] * len(dates)\n",
+    "            all_categories += [category] * len(dates)\n",
+    "        except Exception:\n",
+    "            # print(\"Error in {:s}\".format(file_name))\n",
+    "            pass\n",
+    "all_ids = list(map(str, range(len(all_posts))))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%timeit\n",
+    "# Time to build the TextClassifier object\n",
+    "tc = TextClassifier(all_posts, all_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 loop, best of 3: 2.36 s per loop\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "# Time for a similarity query\n",
+    "tc.get_similar(all_ids[1], max_similars=3, term_diff_max_rank=50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 loop, best of 3: 17.4 s per loop\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "# Time to train the classifier\n",
+    "tc.make_classifier(\"topic\", all_ids, all_categories)"
+   ]
+  },
+  {
"code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 loops, best of 3: 31.4 ms per loop\n" + ] + } + ], + "source": [ + "%%timeit\n", + "tc.classify(\"topic\", all_ids[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0., 0., 0., ..., 0., 0., 0.]])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row.toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['0', '1', '10', ..., '997', '998', '999'], \n", + " dtype='|S4')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + }, + "notify_time": "5" + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/tests/test_text_classifier.py b/tests/test_text_classifier.py index 0a24663..2c381d5 100644 --- a/tests/test_text_classifier.py +++ b/tests/test_text_classifier.py @@ -11,10 +11,9 @@ import os import codecs import numpy as np -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.datasets import fetch_20newsgroups sys.path.insert(0, os.path.abspath('..')) - from textar import TextClassifier @@ -32,7 +31,7 @@ def test_get_similar(self): "El edificio más antiguo tiene muchas cuadros caros porque era de un multimillonario", "El edificio más moderno tiene muchas programadoras que comen manzanas durante el almuerzo grupal" ], - ids=map(str, range(4)) + ids=list(map(str, range(4))) ) ids, distancias, palabras_comunes = tc.get_similar( @@ -42,10 +41,14 @@ def test_get_similar(self): self.assertEqual(ids, ['0', '3', '2', '1']) self.assertEqual( - palabras_comunes, + [ + sorted(palabras) + for palabras in palabras_comunes + ] + , [ [u'edificio', u'manzanas'], - [u'edificio', u'muchas', u'manzanas'], + [u'edificio', u'manzanas', u'muchas'], [u'edificio', u'muchas'], [u'muchas'] ] ) @@ -60,13 +63,13 @@ def test_classify(self): "Para hacer una torta de naranja se necesita harina, huevos, leche, ralladura de naranja y polvo de hornear", "Para hacer un lemon pie se necesita crema, ralladura de limón, huevos, leche y harina" ], - ids=map(str, range(6)) + ids=list(map(str, range(6))) ) # entrena un clasificador tc.make_classifier( name="recetas_classifier", - ids=map(str, range(6)), + ids=list(map(str, range(6))), labels=["Comida", "Comida", "Trago", "Trago", "Postre", "Postre"] ) diff --git a/textar/__init__.py b/textar/__init__.py index 7688f12..e669d40 100644 --- a/textar/__init__.py +++ b/textar/__init__.py @@ -4,4 +4,4 @@ __email__ = 'datos@modernizacion.gob.ar' __version__ = '0.0.4' -from text_classifier import TextClassifier +from .text_classifier import TextClassifier diff --git a/textar/text_classifier.py b/textar/text_classifier.py index 00da330..429cced 100644 --- 
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+
 u"""Módulo de clasificación de textos.
 
 Este módulo contiene a los objetos que permiten entrenar un clasificador
@@ -10,8 +11,6 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.linear_model import SGDClassifier
-from sklearn.svm import LinearSVC
-from scipy import sparse
 import pandas as pd
 import numpy as np
 import os
@@ -47,7 +46,7 @@ def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
             input='content', encoding=encoding, decode_error='strict',
             strip_accents='ascii', lowercase=True, preprocessor=None,
             tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
-            analyzer='word', max_df=1.0, min_df=1, max_features=None,
+            analyzer='word', max_df=0.8, min_df=1, max_features=None,
             vocabulary=vocabulary, binary=False)
 
         self.transformer = TfidfTransformer()
@@ -103,7 +102,7 @@ def retrain(self, name, ids, labels):
         except AttributeError:
             raise AttributeError("No hay ningun clasificador con ese nombre.")
         indices = np.in1d(self.ids, ids)
-        if isinstance(labels, basestring):
+        if isinstance(labels, str):
             labels = [labels]
         classifier.partial_fit(self.tfidf_mat[indices, :], labels)
 
@@ -147,7 +146,7 @@ def _make_text_vectors(self, examples):
                 El tamaño de la matriz es de (N, T) donde N es la cantidad de
                 ejemplos y T es la cantidad de términos en el vocabulario.
         """
-        if isinstance(examples, basestring):
+        if isinstance(examples, str):
             if examples in self.ids:
                 textvec = self.tfidf_mat[self.ids == examples, :]
             else:
@@ -168,7 +167,8 @@
         return textvec
 
     def get_similar(self, example, max_similars=3, similarity_cutoff=None,
-                    term_diff_cutoff=0.6):
+                    term_diff_max_rank=10, filter_list=None,
+                    term_diff_cutoff=None):
         """Devuelve textos similares al ejemplo dentro de los textos
         entrenados.
         Nota:
@@ -181,11 +181,14 @@
                 devolver.
             similarity_cutoff (float, optional): Valor umbral de similaridad
                 para definir que dos textos son similares entre si.
-            term_diff_cutoff (float, optional): Este valor sirve para controlar
+            term_diff_max_rank (int, optional): Este valor sirve para controlar
                 el umbral con el que los terminos son considerados importantes
                 a la hora de recuperar textos (no afecta el funcionamiento de
                 que textos se consideran cercanos, solo la cantidad de terminos
                 que se devuelven en best_words).
+            filter_list (list): Lista de ids de textos en la cual buscar textos
+                similares.
+            term_diff_cutoff (float): Deprecado. Se quitara en el futuro.
 
         Returns:
             tuple (list, list, list): (text_ids, sorted_dist, best_words)
@@ -197,22 +200,46 @@
                 palabras mas relevantes que se usaron para seleccionar esa
                 sugerencia.
         """
-        if max_similars > self.term_mat.shape[0]:
+
+        if term_diff_cutoff:
+            warnings.warn('Deprecado. Quedo sin uso. Se quitara en el futuro.',
+                          DeprecationWarning)
+        if filter_list:
+            if max_similars > len(filter_list):
+                raise ValueError("No se pueden pedir mas sugerencias que la \
+                    cantidad de textos en `filter_list`.")
+            else:
+                filt_idx = np.in1d(self.ids, filter_list)
+
+        elif max_similars > self.term_mat.shape[0]:
             raise ValueError("No se pueden pedir mas sugerencias que la \
                 cantidad de textos que hay almacenados.")
+        else:
+            filt_idx = np.ones(len(self.ids), dtype=bool)
+        # Saco los textos compuestos solo por stop_words
+        good_ids = np.array(np.sum(self.term_mat, 1) > 0).squeeze()
+        filt_idx = filt_idx & good_ids
         if example in self.ids:
             index = self.ids == example
             exmpl_vec = self.tfidf_mat[index, :]
-            distances = np.squeeze(pairwise_distances(self.tfidf_mat,
+            distances = np.squeeze(pairwise_distances(self.tfidf_mat[filt_idx],
                                                       exmpl_vec))
             # Pongo la distancia a si mismo como inf, par que no se devuelva a
             # si mismo como una opcion
-            distances[index] = np.inf
+            if filter_list and example in filter_list:
+                distances[filter_list.index(example)] = np.inf
+            elif not filter_list:
+                idx_example = np.searchsorted(self.ids, example)
+                filt_idx_example = np.searchsorted(np.flatnonzero(filt_idx),
+                                                   idx_example)
+                distances[filt_idx_example] = np.inf
         else:
             exmpl_vec = self.vectorizer.transform([example])  # contar terminos
             exmpl_vec = self.transformer.transform(exmpl_vec)  # calcular tfidf
-            distances = np.squeeze(pairwise_distances(self.tfidf_mat,
+            distances = np.squeeze(pairwise_distances(self.tfidf_mat[filt_idx],
                                                       exmpl_vec))
+        if np.sum(exmpl_vec) == 0:
+            return [], [], []
         sorted_indices = np.argsort(distances)
         closest_n = sorted_indices[:max_similars]
         sorted_dist = distances[closest_n]
@@ -220,20 +247,26 @@
             closest_n = closest_n[sorted_dist < similarity_cutoff]
             sorted_dist = sorted_dist[sorted_dist < similarity_cutoff]
         best_words = []
-        exmpl_vec = exmpl_vec.toarray()
+        # Calculo palabras relevantes para cada sugerencia
+        best_example = np.squeeze(exmpl_vec.toarray())
+        sorted_example_weights = np.flipud(np.argsort(best_example))
+        truncated_max_rank = min(term_diff_max_rank, np.sum(best_example > 0))
+        best_example = sorted_example_weights[:truncated_max_rank]
         for suggested in closest_n:
-            test_vec = self.tfidf_mat[suggested, :].toarray()
-            differences = np.abs(exmpl_vec - test_vec)**2 / \
-                (exmpl_vec**2 + test_vec**2)
-            differences = np.squeeze(np.array(differences))
-            sort_I = np.argsort(differences)
-            limit = np.flatnonzero((differences[sort_I] > term_diff_cutoff)
-                                   | (np.isnan(differences[sort_I]))
-                                   )[0]
+            test_vec = np.squeeze(self.tfidf_mat[suggested, :].toarray())
+            sorted_test_weights = np.flipud(np.argsort(test_vec))
+            truncated_max_rank = min(term_diff_max_rank,
+                                     np.sum(test_vec > 0))
+            best_test = sorted_test_weights[:truncated_max_rank]
+            best_words_ids = np.intersect1d(best_example, best_test)
             best_words.append([k for k, v in
-                               self.vectorizer.vocabulary_.iteritems()
-                               if v in sort_I[:limit]])
-        text_ids = self.ids[closest_n]
+                               self.vectorizer.vocabulary_.items()
+                               if v in best_words_ids])
+        if filter_list:
+            filt_idx_to_general_idx = np.flatnonzero(filt_idx)
+            text_ids = self.ids[filt_idx_to_general_idx[closest_n]]
+        else:
+            text_ids = self.ids[closest_n]
         return list(text_ids), list(sorted_dist), best_words
 
     def reload_texts(self, texts, ids, vocabulary=None):
@@ -307,5 +340,5 @@ def _check_id_length(self, ids):
                 ingresado textos planos en lugar de ids.")
 
     def _check_repeated_ids(self, ids):
-        if length(np.unique(ids)) != length(ids):
+        if len(np.unique(ids)) != len(ids):
             raise ValueError("Hay ids repetidos.")
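
A minimal usage sketch of the reworked `get_similar` API introduced by this diff, for reviewers. The texts and ids below are invented placeholders (not fixtures from the test suite), and the snippet assumes textar is installed with this patch applied:

    from textar import TextClassifier

    tc = TextClassifier(
        texts=[
            "El gato toma leche en la cocina",
            "El perro come carne en el patio",
            "La programadora escribe codigo en la oficina",
            "El gato duerme sobre el sillon de la oficina",
        ],
        # on Python 3, map() returns an iterator, so it must be materialized
        ids=list(map(str, range(4)))
    )

    # Restrict the search to the subset given by `filter_list`;
    # `term_diff_max_rank` caps how many top-weighted terms per text
    # are intersected to build best_words.
    ids, distances, best_words = tc.get_similar(
        "0",
        max_similars=2,
        filter_list=["1", "2", "3"],
        term_diff_max_rank=10
    )
    print(ids, distances, best_words)

Because `filter_list` is applied as a boolean mask over the stored tf-idf matrix, the indices of the closest rows are mapped back to the original id space before being returned.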