Merge pull request #333 from datosgobar/331-analyze-search
Reorganize the metadata index
lucaslavandeira authored Aug 6, 2018
2 parents 18bd254 + d0b88b7 commit 77b4b46
Showing 6 changed files with 58 additions and 17 deletions.
3 changes: 3 additions & 0 deletions series_tiempo_ar_api/apps/metadata/constants.py
@@ -14,3 +14,6 @@
    'dataset_source': 'dataset_source_keyword',
    'catalog_id': 'catalog_id',
}


ANALYZER = 'spanish_asciifold'
21 changes: 12 additions & 9 deletions series_tiempo_ar_api/apps/metadata/indexer/doc_types.py
@@ -3,29 +3,32 @@

from series_tiempo_ar_api.apps.metadata import constants
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance
from .index import get_fields_meta_index


class Field(DocType):
""" Formato de los docs de metadatos a indexar en ES."""
title = Keyword()
description = Text()
description = Text(analyzer=constants.ANALYZER, copy_to='all')
id = Keyword()
dataset_title = Text()
dataset_description = Text()
dataset_theme = Keyword()
units = Keyword()
dataset_publisher_name = Keyword()
catalog_id = Keyword()
dataset_title = Text(analyzer=constants.ANALYZER, copy_to='all')
dataset_description = Text(analyzer=constants.ANALYZER, copy_to='all')
dataset_theme = Keyword(copy_to='all')
units = Keyword(copy_to='all')
dataset_publisher_name = Keyword(copy_to='all')
catalog_id = Keyword(copy_to='all')

# Guardamos una copia como keyword para poder usar en aggregations
dataset_source = Text()
dataset_source = Text(analyzer=constants.ANALYZER, copy_to='all')
dataset_source_keyword = Keyword()

periodicity = Keyword()
start_date = Date()
end_date = Date()

all = Text(analyzer=constants.ANALYZER)

class Meta:
dynamic = MetaField('strict')
index = constants.FIELDS_INDEX
index = get_fields_meta_index()._name
using = ElasticInstance.get()
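
The keyword copy kept next to dataset_source exists because the analyzed text field is not aggregatable by default; aggregations have to hit dataset_source_keyword. Below is a minimal sketch (not part of this diff) of such a terms aggregation with elasticsearch_dsl, assuming the index above is populated; the bucket name 'sources' is illustrative.

# Sketch: bucket series by source using the keyword copy; the analyzed
# 'dataset_source' text field cannot back a terms aggregation without fielddata.
from elasticsearch_dsl import Search

from series_tiempo_ar_api.apps.metadata.indexer.doc_types import Field
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance

search = Search(using=ElasticInstance.get(), index=Field._doc_type.index)
search.aggs.bucket('sources', 'terms', field='dataset_source_keyword')
response = search.execute()
for bucket in response.aggregations.sources.buckets:
    print(bucket.key, bucket.doc_count)
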
25 changes: 25 additions & 0 deletions series_tiempo_ar_api/apps/metadata/indexer/index.py
@@ -0,0 +1,25 @@
#! coding: utf-8
from elasticsearch_dsl import Index, analyzer

from series_tiempo_ar_api.apps.metadata import constants
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance


def add_analyzer(index: Index):
"""Agrega un nuevo analyzer al índice, disponible para ser usado
en todos sus fields. El analyzer aplica lower case + ascii fold:
quita acentos y uso de ñ, entre otros, para permitir búsqueda de
texto en español
"""
index.analyzer(
analyzer(constants.ANALYZER,
tokenizer='standard',
filter=['lowercase', 'asciifolding'])
)


def get_fields_meta_index():
    fields_meta = Index(constants.FIELDS_INDEX, using=ElasticInstance.get())

    add_analyzer(fields_meta)
    return fields_meta
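
The new module registers the spanish_asciifold analyzer as a standard tokenizer plus lowercase and asciifolding filters, so accented and unaccented spellings reduce to the same terms. A rough sketch (not part of this diff) of checking that behaviour through the _analyze API once the index exists; the sample text is illustrative.

# Sketch: inspect how the custom analyzer tokenizes Spanish text.
from series_tiempo_ar_api.apps.metadata.indexer.index import get_fields_meta_index
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance

index = get_fields_meta_index()
if not index.exists():
    index.create()  # registers spanish_asciifold in the index settings

result = ElasticInstance.get().indices.analyze(
    index=index._name,
    body={'analyzer': 'spanish_asciifold', 'text': 'Índice de Exportación'},
)
print([token['token'] for token in result['tokens']])
# expected: ['indice', 'de', 'exportacion'] (lowercased, accents folded)
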
13 changes: 8 additions & 5 deletions series_tiempo_ar_api/apps/metadata/indexer/metadata_indexer.py
@@ -3,26 +3,29 @@

from django_rq import job

from elasticsearch_dsl import Index
from django_datajsonar.models import Node
from series_tiempo_ar_api.apps.metadata.indexer.doc_types import Field
from series_tiempo_ar_api.apps.metadata.models import IndexMetadataTask
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance
from .doc_types import Field
from .catalog_meta_indexer import CatalogMetadataIndexer
from .index import get_fields_meta_index

logger = logging.getLogger(__name__)


class MetadataIndexer:

    def __init__(self, task, doc_type=Field):
    def __init__(self, task, doc_type=Field, index: Index = get_fields_meta_index()):
        self.task = task
        self.elastic = ElasticInstance.get()
        self.index = index
        self.doc_type = doc_type
        self.index = self.doc_type._doc_type.index

    def init_index(self):
        if not self.elastic.indices.exists(self.index):
            self.doc_type.init(using=self.elastic)
        if not self.index.exists():
            self.index.doc_type(self.doc_type)
            self.index.create()

        # Update the mapping in case new fields were added
        self.doc_type._doc_type.refresh()
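
With the Index object injected through the constructor, init_index creates the index via elasticsearch_dsl (doc type mapping and analyzer together) instead of going through the low-level indices client. A rough usage sketch under the defaults shown above; creating a task just to call init_index is simplified for illustration.

# Sketch: make sure the metadata index (with its analyzer) exists
# before an indexing run.
from series_tiempo_ar_api.apps.metadata.indexer.metadata_indexer import MetadataIndexer
from series_tiempo_ar_api.apps.metadata.models import IndexMetadataTask

task = IndexMetadataTask.objects.create()
indexer = MetadataIndexer(task)  # defaults: doc_type=Field, index=get_fields_meta_index()
indexer.init_index()             # creates the index only if it does not exist yet
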
2 changes: 1 addition & 1 deletion series_tiempo_ar_api/apps/metadata/queries/query.py
@@ -57,7 +57,7 @@ def execute(self):

        querystring = self.args.get(constants.PARAM_QUERYSTRING)
        if querystring is not None:
            search = search.query('match', _all=querystring)
            search = search.query('match', all=querystring)

        offset = self.args[constants.PARAM_OFFSET]
        limit = self.args[constants.PARAM_LIMIT]
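
Free-text search now targets the custom all field populated by the copy_to mappings (and analyzed with spanish_asciifold) instead of Elasticsearch's _all meta field. A minimal standalone sketch (not part of this diff) of the equivalent query; the search text is illustrative.

# Sketch: querystring search against the analyzed 'all' field.
from elasticsearch_dsl import Search

from series_tiempo_ar_api.apps.metadata.indexer.doc_types import Field
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance

search = Search(using=ElasticInstance.get(), index=Field._doc_type.index)
search = search.query('match', all='indice de precios')  # also matches 'Índice de precios'
for hit in search.execute():
    print(hit.title, hit.catalog_id)
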
11 changes: 9 additions & 2 deletions series_tiempo_ar_api/apps/metadata/tests/indexer_tests.py
@@ -4,30 +4,37 @@
import os

import faker
from elasticsearch_dsl import Index
from django.test import TestCase
from django_datajsonar.tasks import read_datajson
from django_datajsonar.models import ReadDataJsonTask, Node, Field as datajsonar_Field

from series_tiempo_ar_api.apps.metadata.indexer.catalog_meta_indexer import CatalogMetadataIndexer
from series_tiempo_ar_api.apps.metadata.indexer.doc_types import Field
from series_tiempo_ar_api.apps.metadata.indexer.index import add_analyzer
from series_tiempo_ar_api.apps.metadata.models import IndexMetadataTask
from series_tiempo_ar_api.libs.indexing.elastic import ElasticInstance
from series_tiempo_ar_api.apps.management import meta_keys
SAMPLES_DIR = os.path.join(os.path.dirname(__file__), 'samples')

fake = faker.Faker()

fake_index = Index(fake.word(), using=ElasticInstance.get())
add_analyzer(fake_index)


class IndexerTests(TestCase):

    class FakeField(Field):
        class Meta:
            index = fake.word()
            index = fake_index._name

    def setUp(self):
        self.elastic = ElasticInstance.get()
        self.task = ReadDataJsonTask.objects.create()
        self.meta_task = IndexMetadataTask.objects.create()
        fake_index.doc_type(self.FakeField)
        fake_index.create()
        self.FakeField.init(using=self.elastic)

    def test_index(self):
@@ -64,4 +64,4 @@ def _index(self, catalog_id, catalog_url, set_availables=True):
        self.elastic.indices.forcemerge()

    def tearDown(self):
        self.elastic.indices.delete(self.FakeField._doc_type.index)
        fake_index.delete()
