Merge pull request #333 from datosgobar/331-analyze-search
Reorganize the metadata index
lucaslavandeira authored Aug 6, 2018
2 parents 18bd254 + d0b88b7 commit 77b4b46
Showing 6 changed files with 58 additions and 17 deletions.
3 changes: 3 additions & 0 deletions series_tiempo_ar_api/apps/metadata/constants.py
@@ -14,3 +14,6 @@
    'dataset_source': 'dataset_source_keyword',
    'catalog_id': 'catalog_id',
}


ANALYZER = 'spanish_asciifold'
21 changes: 12 additions & 9 deletions series_tiempo_ar_api/apps/metadata/indexer/doc_types.py
@@ -3,29 +3,32 @@

from series_tiempo_ar_api.apps.metadata import constants
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance
from .index import get_fields_meta_index


class Field(DocType):
""" Formato de los docs de metadatos a indexar en ES."""
title = Keyword()
description = Text()
description = Text(analyzer=constants.ANALYZER, copy_to='all')
id = Keyword()
dataset_title = Text()
dataset_description = Text()
dataset_theme = Keyword()
units = Keyword()
dataset_publisher_name = Keyword()
catalog_id = Keyword()
dataset_title = Text(analyzer=constants.ANALYZER, copy_to='all')
dataset_description = Text(analyzer=constants.ANALYZER, copy_to='all')
dataset_theme = Keyword(copy_to='all')
units = Keyword(copy_to='all')
dataset_publisher_name = Keyword(copy_to='all')
catalog_id = Keyword(copy_to='all')

# Guardamos una copia como keyword para poder usar en aggregations
dataset_source = Text()
dataset_source = Text(analyzer=constants.ANALYZER, copy_to='all')
dataset_source_keyword = Keyword()

periodicity = Keyword()
start_date = Date()
end_date = Date()

all = Text(analyzer=constants.ANALYZER)

class Meta:
dynamic = MetaField('strict')
index = constants.FIELDS_INDEX
index = get_fields_meta_index()._name
using = ElasticInstance.get()
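
The keyword copy kept next to dataset_source exists because the analyzed text field is not aggregatable by default; aggregations have to hit dataset_source_keyword. Below is a minimal sketch (not part of this diff) of such a terms aggregation with elasticsearch_dsl, assuming the index above is populated; the bucket name 'sources' is illustrative.

# Sketch: bucket series by source using the keyword copy; the analyzed
# 'dataset_source' text field cannot back a terms aggregation without fielddata.
from elasticsearch_dsl import Search

from series_tiempo_ar_api.apps.metadata.indexer.doc_types import Field
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance

search = Search(using=ElasticInstance.get(), index=Field._doc_type.index)
search.aggs.bucket('sources', 'terms', field='dataset_source_keyword')
response = search.execute()
for bucket in response.aggregations.sources.buckets:
    print(bucket.key, bucket.doc_count)
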
25 changes: 25 additions & 0 deletions series_tiempo_ar_api/apps/metadata/indexer/index.py
@@ -0,0 +1,25 @@
#! coding: utf-8
from elasticsearch_dsl import Index, analyzer

from series_tiempo_ar_api.apps.metadata import constants
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance


def add_analyzer(index: Index):
"""Agrega un nuevo analyzer al índice, disponible para ser usado
en todos sus fields. El analyzer aplica lower case + ascii fold:
quita acentos y uso de ñ, entre otros, para permitir búsqueda de
texto en español
"""
index.analyzer(
analyzer(constants.ANALYZER,
tokenizer='standard',
filter=['lowercase', 'asciifolding'])
)


def get_fields_meta_index():
    fields_meta = Index(constants.FIELDS_INDEX, using=ElasticInstance.get())

    add_analyzer(fields_meta)
    return fields_meta
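
The new module registers the spanish_asciifold analyzer as a standard tokenizer plus lowercase and asciifolding filters, so accented and unaccented spellings reduce to the same terms. A rough sketch (not part of this diff) of checking that behaviour through the _analyze API once the index exists; the sample text is illustrative.

# Sketch: inspect how the custom analyzer tokenizes Spanish text.
from series_tiempo_ar_api.apps.metadata.indexer.index import get_fields_meta_index
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance

index = get_fields_meta_index()
if not index.exists():
    index.create()  # registers spanish_asciifold in the index settings

result = ElasticInstance.get().indices.analyze(
    index=index._name,
    body={'analyzer': 'spanish_asciifold', 'text': 'Índice de Exportación'},
)
print([token['token'] for token in result['tokens']])
# expected: ['indice', 'de', 'exportacion'] (lowercased, accents folded)
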
13 changes: 8 additions & 5 deletions series_tiempo_ar_api/apps/metadata/indexer/metadata_indexer.py
@@ -3,26 +3,29 @@

from django_rq import job

from elasticsearch_dsl import Index
from django_datajsonar.models import Node
from series_tiempo_ar_api.apps.metadata.indexer.doc_types import Field
from series_tiempo_ar_api.apps.metadata.models import IndexMetadataTask
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance
from .doc_types import Field
from .catalog_meta_indexer import CatalogMetadataIndexer
from .index import get_fields_meta_index

logger = logging.getLogger(__name__)


class MetadataIndexer:

    def __init__(self, task, doc_type=Field):
    def __init__(self, task, doc_type=Field, index: Index = get_fields_meta_index()):
        self.task = task
        self.elastic = ElasticInstance.get()
        self.index = index
        self.doc_type = doc_type
        self.index = self.doc_type._doc_type.index

    def init_index(self):
        if not self.elastic.indices.exists(self.index):
            self.doc_type.init(using=self.elastic)
        if not self.index.exists():
            self.index.doc_type(self.doc_type)
            self.index.create()

        # Update the mapping in case new fields were added
        self.doc_type._doc_type.refresh()
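
With the Index object injected through the constructor, init_index creates the index via elasticsearch_dsl (doc type mapping and analyzer together) instead of going through the low-level indices client. A rough usage sketch under the defaults shown above; creating a task just to call init_index is simplified for illustration.

# Sketch: make sure the metadata index (with its analyzer) exists
# before an indexing run.
from series_tiempo_ar_api.apps.metadata.indexer.metadata_indexer import MetadataIndexer
from series_tiempo_ar_api.apps.metadata.models import IndexMetadataTask

task = IndexMetadataTask.objects.create()
indexer = MetadataIndexer(task)  # defaults: doc_type=Field, index=get_fields_meta_index()
indexer.init_index()             # creates the index only if it does not exist yet
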
2 changes: 1 addition & 1 deletion series_tiempo_ar_api/apps/metadata/queries/query.py
@@ -57,7 +57,7 @@ def execute(self):

        querystring = self.args.get(constants.PARAM_QUERYSTRING)
        if querystring is not None:
            search = search.query('match', _all=querystring)
            search = search.query('match', all=querystring)

        offset = self.args[constants.PARAM_OFFSET]
        limit = self.args[constants.PARAM_LIMIT]
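
Free-text search now targets the custom all field populated by the copy_to mappings (and analyzed with spanish_asciifold) instead of Elasticsearch's _all meta field. A minimal standalone sketch (not part of this diff) of the equivalent query; the search text is illustrative.

# Sketch: querystring search against the analyzed 'all' field.
from elasticsearch_dsl import Search

from series_tiempo_ar_api.apps.metadata.indexer.doc_types import Field
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance

search = Search(using=ElasticInstance.get(), index=Field._doc_type.index)
search = search.query('match', all='indice de precios')  # also matches 'Índice de precios'
for hit in search.execute():
    print(hit.title, hit.catalog_id)
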
11 changes: 9 additions & 2 deletions series_tiempo_ar_api/apps/metadata/tests/indexer_tests.py
@@ -4,30 +4,37 @@
import os

import faker
from elasticsearch_dsl import Index
from django.test import TestCase
from django_datajsonar.tasks import read_datajson
from django_datajsonar.models import ReadDataJsonTask, Node, Field as datajsonar_Field

from series_tiempo_ar_api.apps.metadata.indexer.catalog_meta_indexer import CatalogMetadataIndexer
from series_tiempo_ar_api.apps.metadata.indexer.doc_types import Field
from series_tiempo_ar_api.apps.metadata.indexer.index import add_analyzer
from series_tiempo_ar_api.apps.metadata.models import IndexMetadataTask
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance
from series_tiempo_ar_api.apps.management import meta_keys
SAMPLES_DIR = os.path.join(os.path.dirname(__file__), 'samples')

fake = faker.Faker()

fake_index = Index(fake.word(), using=ElasticInstance.get())
add_analyzer(fake_index)


class IndexerTests(TestCase):

    class FakeField(Field):
        class Meta:
            index = fake.word()
            index = fake_index._name

    def setUp(self):
        self.elastic = ElasticInstance.get()
        self.task = ReadDataJsonTask.objects.create()
        self.meta_task = IndexMetadataTask.objects.create()
        fake_index.doc_type(self.FakeField)
        fake_index.create()
        self.FakeField.init(using=self.elastic)

    def test_index(self):
@@ -64,4 +64,4 @@ def _index(self, catalog_id, catalog_url, set_availables=True):
        self.elastic.indices.forcemerge()

    def tearDown(self):
        self.elastic.indices.delete(self.FakeField._doc_type.index)
        fake_index.delete()
