Skip to content

Commit

Permalink
Merge pull request #3 from datosgobar/dev-nacho
Browse files Browse the repository at this point in the history
Dev nacho
  • Loading branch information
meliascosta authored Jun 13, 2017
2 parents f9725b1 + c5180c2 commit 3bfe582
Show file tree
Hide file tree
Showing 5 changed files with 301 additions and 32 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ var/
pip-log.txt
pip-delete-this-directory.txt

# Tests de performance
tests/data/performance_data/

# Unit test / coverage reports
htmlcov/
.tox/
Expand Down
230 changes: 230 additions & 0 deletions tests/Pruebas de Performance Textar.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext memory_profiler\n",
"from textar import TextClassifier\n",
"import xml.etree.ElementTree as ET\n",
"from lxml import etree\n",
"import numpy as np\n",
"import re\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Helper funcs\n",
"\n",
"def parse_blog(tree, min_words=100):\n",
" dates = []\n",
" posts = []\n",
" for elem in tree:\n",
" post = None\n",
" if elem.tag == 'date':\n",
" date = elem.text\n",
" elif elem.tag == 'post':\n",
" post = elem.text\n",
" if post is not None: \n",
" words = re.findall('\\w+\\W',post)\n",
" if len(words) > min_words and np.mean(map(len,words))>2:\n",
" dates.append(date)\n",
" posts.append(post)\n",
" return dates, posts"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Configs\n",
"DATA_FOLDER = os.path.join('.','data','performance_data','blogs')\n",
"MAX_FILES = 10000"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"magic = '''<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n",
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\" [\n",
" <!ENTITY nbsp ' '>\n",
" ]>'''\n",
"\n",
"parser = etree.XMLParser(recover=True)\n",
"\n",
"all_dates = []\n",
"all_posts = []\n",
"all_genders = []\n",
"all_ages = []\n",
"all_categories = []\n",
"\n",
"for file_name in os.listdir(DATA_FOLDER)[:MAX_FILES]:\n",
" id_f, gender, age, category, zodiac, ext = file_name.split('.')\n",
" with open(os.path.join(DATA_FOLDER, file_name), 'r') as f:\n",
" try:\n",
" tree = ET.fromstring(magic + f.read(), parser=parser)\n",
" dates, posts = parse_blog(tree)\n",
" all_posts += posts\n",
" all_dates += dates\n",
" all_genders += [gender] * len(dates)\n",
" all_ages += [age] * len(dates)\n",
" all_categories += [category] * len(dates)\n",
" except Exception as e:\n",
" pass\n",
" #print(\"Error en {:s}\".format(file_name))\n",
"all_ids = map(str, range(len(all_posts)))"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"%%timeit\n",
"# Tiempo de la creacion del objeto\n",
"tc = TextClassifier(all_posts, all_ids)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 2.36 s per loop\n"
]
}
],
"source": [
"%%timeit\n",
"# Tiempo de la busqueda\n",
"tc.get_similar(all_ids[1],max_similars=3, term_diff_max_rank=50)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 17.4 s per loop\n"
]
}
],
"source": [
"%%timeit\n",
"# Tiempo de creacion del clasificador\n",
"tc.make_classifier(\"topic\",all_ids, all_categories)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 31.4 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"tc.classify(\"topic\", all_ids[1])"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0., 0., 0., ..., 0., 0., 0.]])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"row.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['0', '1', '10', ..., '997', '998', '999'], \n",
" dtype='|S4')"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"language": "python",
"name": "Python [Root]"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
},
"notify_time": "5"
},
"nbformat": 4,
"nbformat_minor": 1
}
17 changes: 10 additions & 7 deletions tests/test_text_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@
import os
import codecs
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
sys.path.insert(0, os.path.abspath('..'))

from textar import TextClassifier


Expand All @@ -32,7 +31,7 @@ def test_get_similar(self):
"El edificio más antiguo tiene muchas cuadros caros porque era de un multimillonario",
"El edificio más moderno tiene muchas programadoras que comen manzanas durante el almuerzo grupal"
],
ids=map(str, range(4))
ids=list(map(str, range(4)))
)

ids, distancias, palabras_comunes = tc.get_similar(
Expand All @@ -42,10 +41,14 @@ def test_get_similar(self):

self.assertEqual(ids, ['0', '3', '2', '1'])
self.assertEqual(
palabras_comunes,
[
sorted(palabras)
for palabras in palabras_comunes
]
,
[
[u'edificio', u'manzanas'],
[u'edificio', u'muchas', u'manzanas'],
[u'edificio', u'manzanas', u'muchas'],
[u'edificio', u'muchas'], [u'muchas']
]
)
Expand All @@ -60,13 +63,13 @@ def test_classify(self):
"Para hacer una torta de naranja se necesita harina, huevos, leche, ralladura de naranja y polvo de hornear",
"Para hacer un lemon pie se necesita crema, ralladura de limón, huevos, leche y harina"
],
ids=map(str, range(6))
ids=list(map(str, range(6)))
)

# entrena un clasificador
tc.make_classifier(
name="recetas_classifier",
ids=map(str, range(6)),
ids=list(map(str, range(6))),
labels=["Comida", "Comida", "Trago", "Trago", "Postre", "Postre"]
)

Expand Down
2 changes: 1 addition & 1 deletion textar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
__email__ = 'datos@modernizacion.gob.ar'
__version__ = '0.0.4'

from text_classifier import TextClassifier
from .text_classifier import TextClassifier
Loading

0 comments on commit 3bfe582

Please sign in to comment.