Skip to content

Commit

Permalink
Substituição da biblioteca python elasticsearch pelo opensearch (#65)
Browse files Browse the repository at this point in the history
Alterações feitas para refletir todas as alterações de ES para OS. Falta apenas o Makefile.

Makefile adaptado para Opensearch

Correções diversas.
  • Loading branch information
Giulio Carvalho authored Dec 6, 2023
2 parents ef087df + 8be7c91 commit 7bb0b2e
Show file tree
Hide file tree
Showing 15 changed files with 173 additions and 168 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.coverage
__pycache__
config/current.env
censo.csv
themes_config.json
49 changes: 25 additions & 24 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
IMAGE_NAMESPACE ?= serenata
IMAGE_NAMESPACE ?= okfn-brasil
IMAGE_NAME ?= querido-diario-api
IMAGE_TAG ?= latest
IMAGE_FORMAT ?= docker

# Elasticsearch ports
# Variables used to connect the app to the ElasticSearch
# OpenSearch ports
# Variables used to connect the app to OpenSearch
QUERIDO_DIARIO_DATABASE_CSV ?= censo.csv
ELASTICSEARCH_PORT1 ?= 9200
ELASTICSEARCH_PORT2 ?= 9300
OPENSEARCH_PORT1 ?= 9200
OPENSEARCH_PORT2 ?= 9300
# Containers data
POD_NAME ?= querido-diario-api
DATABASE_CONTAINER_NAME ?= $(POD_NAME)-db
ELASTICSEARCH_CONTAINER_NAME ?= $(POD_NAME)-elasticsearch
OPENSEARCH_CONTAINER_NAME ?= $(POD_NAME)-opensearch
# Database info used to run the tests
POSTGRES_USER ?= companies
POSTGRES_PASSWORD ?= companies
Expand All @@ -20,7 +20,7 @@ POSTGRES_HOST ?= localhost
POSTGRES_PORT ?= 5432
POSTGRES_IMAGE ?= docker.io/postgres:10
DATABASE_RESTORE_FILE ?= contrib/data/queridodiariodb.tar
# Run integration tests. Run local elasticsearch to validate the iteration
# Run integration tests. Run a local OpenSearch to validate the integration
RUN_INTEGRATION_TESTS ?= 0

API_PORT := 8080
Expand Down Expand Up @@ -72,20 +72,20 @@ destroy-pod:
podman pod rm --force --ignore $(POD_NAME)

create-pod: destroy-pod
cp --no-clobber config/sample.env config/current.env
-cp --no-clobber config/sample.env config/current.env
podman pod create --publish $(API_PORT):$(API_PORT) \
--publish $(ELASTICSEARCH_PORT1):$(ELASTICSEARCH_PORT1) \
--publish $(ELASTICSEARCH_PORT2):$(ELASTICSEARCH_PORT2) \
--publish $(POSTGRES_PORT):$(POSTGRES_PORT) \
--publish $(OPENSEARCH_PORT1):$(OPENSEARCH_PORT1) \
--publish $(OPENSEARCH_PORT2):$(OPENSEARCH_PORT2) \
--name $(POD_NAME)

set-test-variables:
$(eval POD_NAME=test-$(POD_NAME))
$(eval DATABASE_CONTAINER_NAME=test-$(DATABASE_CONTAINER_NAME))
$(eval API_PORT=8088)
$(eval ELASTICSEARCH_PORT1=9201)
$(eval ELASTICSEARCH_PORT2=9301)
$(eval ELASTICSEARCH_CONTAINER_NAME=test-$(ELASTICSEARCH_CONTAINER_NAME))
$(eval OPENSEARCH_PORT1=9201)
$(eval OPENSEARCH_PORT2=9301)
$(eval OPENSEARCH_CONTAINER_NAME=test-$(OPENSEARCH_CONTAINER_NAME))
$(eval QUERIDO_DIARIO_DATABASE_CSV="")

set-integration-test-variables: set-test-variables
Expand All @@ -99,14 +99,14 @@ retest: set-test-variables black
$(call run-command, python -m unittest discover tests)

.PHONY: test-all
test-all: set-integration-test-variables create-pod elasticsearch database retest
test-all: set-integration-test-variables create-pod opensearch database retest

.PHONY: test-shell
test-shell: set-test-variables
$(call run-command, bash)

.PHONY: coverage
coverage: set-test-variables create-pod elasticsearch database
coverage: set-test-variables create-pod opensearch database
$(call run-command, coverage erase)
$(call run-command, coverage run -m unittest tests)
$(call run-command, coverage report -m)
Expand All @@ -119,35 +119,36 @@ shell:
bash

.PHONY: run
run: create-pod elasticsearch database load-data rerun
run: create-pod opensearch database rerun

.PHONY:load-data
load-data:
$(call run-command, python scripts/load_fake_gazettes.py)


.PHONY: rerun
rerun: wait-elasticsearch wait-database
rerun: wait-opensearch wait-database
$(call run-command, python main)

.PHONY: runshell
runshell:
$(call run-command, bash)


elasticsearch: stop-elasticsearch start-elasticsearch wait-elasticsearch
opensearch: stop-opensearch start-opensearch wait-opensearch

start-elasticsearch:
start-opensearch:
podman run -d --rm -ti \
--name $(ELASTICSEARCH_CONTAINER_NAME) \
--name $(OPENSEARCH_CONTAINER_NAME) \
--pod $(POD_NAME) \
--env discovery.type=single-node \
elasticsearch:7.9.1
--env plugins.security.ssl.http.enabled=false \
opensearchproject/opensearch:2.9.0

stop-elasticsearch:
podman rm --force --ignore $(ELASTICSEARCH_CONTAINER_NAME)
stop-opensearch:
podman rm --force --ignore $(OPENSEARCH_CONTAINER_NAME)

wait-elasticsearch:
wait-opensearch:
$(call wait-for, localhost:9200)

.PHONY: stop-database
Expand Down
4 changes: 2 additions & 2 deletions api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ async def get_gazettes(
),
querystring: str = Query(
"",
description='Search in gazettes using ElasticSearch\'s "simple query string syntax" (an empty field returns no excerpts, only the results metadata).',
description='Search in gazettes using OpenSearch\'s "simple query string syntax" (an empty field returns no excerpts, only the results metadata).',
),
excerpt_size: int = Query(
500,
Expand Down Expand Up @@ -340,7 +340,7 @@ async def get_themed_excerpts(
),
querystring: str = Query(
"",
description='Search in excerpts using ElasticSearch\'s "simple query string syntax".',
description='Search in excerpts using OpenSearch\'s "simple query string syntax".',
),
pre_tags: List[str] = Query(
[""],
Expand Down
7 changes: 4 additions & 3 deletions config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class Configuration:
def __init__(self):
self.host = os.environ.get("QUERIDO_DIARIO_ELASTICSEARCH_HOST", "")
self.host = os.environ.get("QUERIDO_DIARIO_OPENSEARCH_HOST", "")
self.root_path = os.environ.get("QUERIDO_DIARIO_API_ROOT_PATH", "")
self.url_prefix = os.environ.get("QUERIDO_DIARIO_URL_PREFIX", "")
self.cors_allow_origins = Configuration._load_list(
Expand Down Expand Up @@ -45,7 +45,7 @@ def __init__(self):
"QUERIDO_DIARIO_SUGGESTION_MAILJET_CUSTOM_ID", ""
)
self.city_database_file = os.environ["CITY_DATABASE_CSV"]
self.gazette_index = os.environ.get("GAZETTE_ELASTICSEARCH_INDEX", "")
self.gazette_index = os.environ.get("GAZETTE_OPENSEARCH_INDEX", "")
self.gazette_content_field = os.environ.get("GAZETTE_CONTENT_FIELD", "")
self.gazette_content_exact_field_suffix = os.environ.get(
"GAZETTE_CONTENT_EXACT_FIELD_SUFFIX", ""
Expand Down Expand Up @@ -96,7 +96,8 @@ def __init__(self):
self.companies_database_user = os.environ.get("POSTGRES_USER", "")
self.companies_database_pass = os.environ.get("POSTGRES_PASSWORD", "")
self.companies_database_port = os.environ.get("POSTGRES_PORT", "")

self.opensearch_user = os.environ.get("QUERIDO_DIARIO_OPENSEARCH_USER", "")
self.opensearch_pswd = os.environ.get("QUERIDO_DIARIO_OPENSEARCH_PASSWORD", "")
@classmethod
def _load_list(cls, key, default=[]):
value = os.environ.get(key, default)
Expand Down
34 changes: 18 additions & 16 deletions config/sample.env
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
QUERIDO_DIARIO_ELASTICSEARCH_HOST=localhost
QUERIDO_DIARIO_OPENSEARCH_HOST=localhost
QUERIDO_DIARIO_OPENSEARCH_USER=admin
QUERIDO_DIARIO_OPENSEARCH_PASSWORD=admin
QUERIDO_DIARIO_SUGGESTION_MAILJET_REST_API_KEY=mailjet.com
QUERIDO_DIARIO_SUGGESTION_MAILJET_REST_API_SECRET=mailjet.com
QUERIDO_DIARIO_SUGGESTION_SENDER_NAME=Sender Name
Expand All @@ -12,21 +14,21 @@ POSTGRES_DB=companiesdb
POSTGRES_HOST=localhost
POSTGRES_PORT=5432
CITY_DATABASE_CSV=censo.csv
GAZETTE_ELASTICSEARCH_INDEX=gazettes
GAZETTE_CONTENT_FIELD=content_field
GAZETTE_CONTENT_EXACT_FIELD_SUFFIX=.field_suffix
GAZETTE_PUBLICATION_DATE_FIELD=publication_date_field
GAZETTE_SCRAPED_AT_FIELD=scraped_at_field
GAZETTE_TERRITORY_ID_FIELD=territory_id_field
GAZETTE_OPENSEARCH_INDEX=querido-diario
GAZETTE_CONTENT_FIELD=source_text
GAZETTE_CONTENT_EXACT_FIELD_SUFFIX=.exact
GAZETTE_PUBLICATION_DATE_FIELD=date
GAZETTE_SCRAPED_AT_FIELD=scraped_at
GAZETTE_TERRITORY_ID_FIELD=territory_id
THEMES_DATABASE_JSON=themes_config.json
THEMED_EXCERPT_CONTENT_FIELD=content_field
THEMED_EXCERPT_CONTENT_EXACT_FIELD_SUFFIX=.field_suffix
THEMED_EXCERPT_PUBLICATION_DATE_FIELD=publication_date_field
THEMED_EXCERPT_SCRAPED_AT_FIELD=scraped_at_field
THEMED_EXCERPT_TERRITORY_ID_FIELD=territory_id_field
THEMED_EXCERPT_ENTITIES_FIELD=entities_field
THEMED_EXCERPT_SUBTHEMES_FIELD=subthemes_field
THEMED_EXCERPT_EMBEDDING_SCORE_FIELD=embedding_score_field
THEMED_EXCERPT_TFIDF_SCORE_FIELD=tfidf_score_field
THEMED_EXCERPT_CONTENT_FIELD=excerpt
THEMED_EXCERPT_CONTENT_EXACT_FIELD_SUFFIX=.exact
THEMED_EXCERPT_PUBLICATION_DATE_FIELD=source_date
THEMED_EXCERPT_SCRAPED_AT_FIELD=source_scraped_at
THEMED_EXCERPT_TERRITORY_ID_FIELD=source_territory_id
THEMED_EXCERPT_ENTITIES_FIELD=excerpt_entities
THEMED_EXCERPT_SUBTHEMES_FIELD=excerpt_subthemes
THEMED_EXCERPT_EMBEDDING_SCORE_FIELD=excerpt_embedding_score
THEMED_EXCERPT_TFIDF_SCORE_FIELD=excerpt_tfidf_score
THEMED_EXCERPT_FRAGMENT_SIZE=10000
THEMED_EXCERPT_NUMBER_OF_FRAGMENTS=1
6 changes: 3 additions & 3 deletions docs/CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ Já leu? Então vamos às informações específicas deste repositório:
| Serviço | [`themed_excerpts`](/themed_excerpts) | Consultas ao índices de busca textual temáticos do QD. | index |
| Módulo | [`database`](/database) | Classe de interação com bancos de dados Postgres. | Postgres |
| Módulo | [`config`](/config) | Configuração de variáveis de ambiente. | |
| Módulo | [`index`](/index) | Classe de interação com índices Elasticsearch. | Elasticsearch |
| Módulo | [`index`](/index) | Classe de interação com índices OpenSearch. | OpenSearch |
| Recurso | Postgres | Banco de dados de CNPJ. Contém informações sobre empresas e sócios cadastrados na Receita Federal. | |
| Recurso | Banco de dados do [Censo](https://censo.ok.org.br) | Banco de dados de municípios. Contém metadados municipais. | |
| Recurso | Elasticsearch | Índices de busca textual. | |
| Recurso | OpenSearch | Índices de busca textual. | |
| Recurso | Mailjet | Serviço de envio de email. | |

## Como configurar o ambiente de desenvolvimento
Expand Down Expand Up @@ -69,4 +69,4 @@ make coverage
```

# Mantendo
As pessoas mantenedoras devem seguir as diretrizes do [Guia para Mantenedoras](https://github.com/okfn-brasil/querido-diario-comunidade/blob/main/.github/CONTRIBUTING.md#mantendo) do Querido Diário.
As pessoas mantenedoras devem seguir as diretrizes do [Guia para Mantenedoras](https://github.com/okfn-brasil/querido-diario-comunidade/blob/main/.github/CONTRIBUTING.md#mantendo) do Querido Diário.
3 changes: 1 addition & 2 deletions gazettes/gazette_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Dict, List, Union

from index import SearchEngineInterface
from index.elasticsearch import (
from index.opensearch import (
QueryBuilderInterface,
DateRangeQueryMixin,
SimpleStringQueryMixin,
Expand Down Expand Up @@ -395,7 +395,6 @@ def create_gazettes_data_gateway(
raise Exception(
"Query builder should implement the QueryBuilderInterface interface"
)

return GazetteSearchEngineGateway(search_engine, query_builder, index)


Expand Down
2 changes: 1 addition & 1 deletion index/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .elasticsearch import create_search_engine_interface, SearchEngineInterface
from .opensearch import create_search_engine_interface, SearchEngineInterface
24 changes: 13 additions & 11 deletions index/elasticsearch.py → index/opensearch.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import abc
import os
import re
from datetime import date
from enum import Enum, unique
from typing import Dict, List, Union
from typing import Dict, List, Tuple, Union

import elasticsearch
import opensearchpy


class SearchEngineInterface(abc.ABC):
Expand All @@ -15,7 +16,7 @@ class SearchEngineInterface(abc.ABC):
@abc.abstractmethod
def search(self, query: Dict, index: str = "", timeout: int = 30) -> Dict:
"""
Searches the index with the provided elasticsearch_dsl.Search
Searches the index with the provided query, a dict sent as the search request body
"""

@abc.abstractmethod
Expand All @@ -25,20 +26,20 @@ def index_exists(self, index: str) -> bool:
"""


class ElasticSearch(SearchEngineInterface):
def __init__(self, host: str, default_index: str = ""):
self._es = elasticsearch.Elasticsearch(hosts=[host])
class OpenSearch(SearchEngineInterface):
def __init__(self, host: str, credentials: Tuple[str, str]=("user", "pswd"), default_index: str = ""):
self._search_engine = opensearchpy.OpenSearch(hosts=[host], http_auth=credentials)
self._default_index = default_index

def search(self, query: Dict, index: str = "", timeout: int = 30) -> Dict:
index_name = self._get_index_name(index)
response = self._es.search(
response = self._search_engine.search(
index=index_name, body=query, request_timeout=timeout
)
return response

def index_exists(self, index: str) -> bool:
return self._es.indices.exists(index=index)
return self._search_engine.indices.exists(index=index)

def _get_index_name(self, index: str) -> str:
index_name = index if self._is_valid_index_name(index) else self._default_index
Expand All @@ -48,7 +49,7 @@ def _get_index_name(self, index: str) -> str:

def _is_valid_index_name(self, index: str) -> bool:
return isinstance(index, str) and len(index) > 0


class QueryBuilderInterface(abc.ABC):
@abc.abstractmethod
Expand Down Expand Up @@ -213,10 +214,11 @@ def build_field_highlight(


def create_search_engine_interface(
host: str = "", default_index: str = ""
host: str = "", credentials: Tuple[str, str]=("user", "pswd"), default_index: str = ""
) -> SearchEngineInterface:
if not isinstance(host, str) or len(host.strip()) == 0:
raise Exception("Missing host")
if not isinstance(default_index, str):
raise Exception("Invalid index name")
return ElasticSearch(host.strip(), default_index=default_index.strip())
return OpenSearch(host.strip(), credentials=credentials, default_index=default_index.strip())

2 changes: 1 addition & 1 deletion main/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
configuration = load_configuration()

search_engine = create_search_engine_interface(
configuration.host, configuration.gazette_index
configuration.host, (configuration.opensearch_user, configuration.opensearch_pswd), configuration.gazette_index
)

gazettes_query_builder = create_gazettes_query_builder(
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ black==19.10b0
coverage==5.2.1
dateparser==0.7.6
fastapi==0.61.0
requests==2.24.0
requests==2.30.0
uvicorn==0.11.8
psycopg2==2.8.5
SQLAlchemy==1.3.19
elasticsearch==7.9.1
opensearch-py==2.3.2
mailjet-rest==1.3.4
Loading

0 comments on commit 7bb0b2e

Please sign in to comment.