From 76efd797a09ac9e239ee0d8b368663d45c1ef4c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Guilherme=20Vanz?= Date: Fri, 10 Nov 2023 15:36:05 -0300 Subject: [PATCH 1/7] feat: Helm chart to run the pipeline. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Creates the charts directory, which contains the Helm chart used to install the whole data-processing pipeline of Querido Diário. Signed-off-by: José Guilherme Vanz --- charts/querido-diario-pipeline/.helmignore | 23 +++++++++++ charts/querido-diario-pipeline/Chart.yaml | 8 ++++ .../templates/text-extraction-configmap.yaml | 27 +++++++++++++ .../templates/text-extraction-cronjob.yaml | 31 +++++++++++++++ .../templates/text-extraction-secrets.yaml | 13 +++++++ charts/querido-diario-pipeline/values.yaml | 39 +++++++++++++++++++ 6 files changed, 141 insertions(+) create mode 100644 charts/querido-diario-pipeline/.helmignore create mode 100644 charts/querido-diario-pipeline/Chart.yaml create mode 100644 charts/querido-diario-pipeline/templates/text-extraction-configmap.yaml create mode 100644 charts/querido-diario-pipeline/templates/text-extraction-cronjob.yaml create mode 100644 charts/querido-diario-pipeline/templates/text-extraction-secrets.yaml create mode 100644 charts/querido-diario-pipeline/values.yaml diff --git a/charts/querido-diario-pipeline/.helmignore b/charts/querido-diario-pipeline/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/querido-diario-pipeline/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/querido-diario-pipeline/Chart.yaml b/charts/querido-diario-pipeline/Chart.yaml new file mode 100644 index 0000000..99bc6e5 --- /dev/null +++ b/charts/querido-diario-pipeline/Chart.yaml @@ -0,0 +1,8 @@ +apiVersion: v2 +name: querido-diario-pipeline +description: Helm chart para realizar a implantação das aplicações do pipeline + de dados do Querido Diário + +type: application +version: 0.1.0 +appVersion: "0.1.0" diff --git a/charts/querido-diario-pipeline/templates/text-extraction-configmap.yaml b/charts/querido-diario-pipeline/templates/text-extraction-configmap.yaml new file mode 100644 index 0000000..12e9324 --- /dev/null +++ b/charts/querido-diario-pipeline/templates/text-extraction-configmap.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: text-extraction-env-vars + namespace: {{ .Release.Namespace }} +data: + STORAGE_REGION: {{ .Values.storage.region }} + STORAGE_ENDPOINT: {{ .Values.storage.endpoint }} + STORAGE_BUCKET: {{ .Values.storage.bucket }} + + POSTGRES_DB: {{ .Values.postgresql.db }} + POSTGRES_HOST: {{ .Values.postgresql.host }} + POSTGRES_PORT: {{ .Values.postgresql.port | quote }} + DATABASE_RESTORE_FILE: {{ .Values.postgresql.database_restore_file }} + + ELASTICSEARCH_HOST: {{ .Values.elasticsearch.host }} + ELASTICSEARCH_INDEX: {{ .Values.elasticsearch.index }} + + APACHE_TIKA_SERVER: {{ .Values.apache_tika.server }} + + QUERIDO_DIARIO_FILES_ENDPOINT: {{ .Values.querido_diario_files_endpoint }} + EXECUTION_MODE: {{ .Values.execution_mode }} + {{- if .Values.debug }} + DEBUG: "1" + {{- else }} + DEBUG: "0" + {{- end }}
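A quick way to sanity-check what this template renders before installing anything is `helm template` (a sketch; the release name is arbitrary and the `grep` only narrows the output):

```console
helm template querido-diario-pipeline charts/querido-diario-pipeline/ | grep -A1 'DEBUG'
```

With the chart defaults (`debug: true`) the rendered ConfigMap carries `DEBUG: "1"`; overriding with `--set debug=false` flips it to `DEBUG: "0"`.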
diff --git a/charts/querido-diario-pipeline/templates/text-extraction-cronjob.yaml b/charts/querido-diario-pipeline/templates/text-extraction-cronjob.yaml new file mode 100644 index 0000000..9058421 --- /dev/null +++ b/charts/querido-diario-pipeline/templates/text-extraction-cronjob.yaml @@ -0,0 +1,31 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ .Values.textExtractionJob.name }} + namespace: {{ .Release.Namespace }} +spec: + schedule: {{ .Values.textExtractionJob.schedule | quote }} + concurrencyPolicy: {{ .Values.textExtractionJob.concurrencyPolicy }} + failedJobsHistoryLimit: {{ .Values.textExtractionJob.failedJobsHistoryLimit }} + successfulJobsHistoryLimit: {{ .Values.textExtractionJob.successfulJobsHistoryLimit }} + jobTemplate: + spec: + template: + spec: + restartPolicy: {{ .Values.textExtractionJob.restartPolicy }} + containers: + - name: text-extractor-job + image: {{ .Values.textExtractionJob.image }} + imagePullPolicy: {{ .Values.textExtractionJob.imagePullPolicy }} + command: + - python + args: + - main + envFrom: + - configMapRef: + name: text-extraction-env-vars + optional: false + - secretRef: + name: text-extraction-secrets + optional: false + diff --git a/charts/querido-diario-pipeline/templates/text-extraction-secrets.yaml b/charts/querido-diario-pipeline/templates/text-extraction-secrets.yaml new file mode 100644 index 0000000..aa1e570 --- /dev/null +++ b/charts/querido-diario-pipeline/templates/text-extraction-secrets.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Secret +metadata: + name: text-extraction-secrets + namespace: {{ .Release.Namespace }} +type: Opaque +stringData: + STORAGE_ACCESS_KEY: {{ .Values.storage.access_key }} + STORAGE_ACCESS_SECRET: {{ .Values.storage.access_secret }} + POSTGRES_USER: {{ .Values.postgresql.user }} + POSTGRES_PASSWORD: {{ .Values.postgresql.password }} + ELASTICSEARCH_USER: {{ .Values.elasticsearch.user }} + ELASTICSEARCH_PASSWORD: {{ .Values.elasticsearch.password }} diff --git a/charts/querido-diario-pipeline/values.yaml b/charts/querido-diario-pipeline/values.yaml new file mode 100644 index 0000000..640195c --- /dev/null +++ b/charts/querido-diario-pipeline/values.yaml @@ -0,0 +1,39 @@ + +textExtractionJob: + concurrencyPolicy: "Forbid" # no concurrency for now + failedJobsHistoryLimit: "30" + image: "ghcr.io/okfn-brasil/querido-diario-data-processing:latest" + imagePullPolicy: "IfNotPresent" + name: "text-extraction-job" + restartPolicy: "OnFailure" + schedule: "*/30 * * * *" + successfulJobsHistoryLimit: "3" + +storage: + region: us-east-1 + endpoint: http://minio.minio.svc.cluster.local:9000 + access_key: querido-diario-user + access_secret: querido-diario-secret + bucket: queridodiariobucket + +postgresql: + db: queridodiariodb + user: queridodiario + password: queridodiario + host: postgresql.postgresql.svc.cluster.local + port: 5432 + database_restore_file: contrib/data/queridodiariodb.tar + +elasticsearch: + host: http://querido-diario-elasticsearch-es-http.default.svc.cluster.local:9200 + index: querido-diario + user: elastic + password: "cluster-pass" + +apache_tika: + server: http://tika.tika.svc.cluster.local:9998 + +debug: true +querido_diario_files_endpoint: "http://google.com" +# options: ALL, DAILY, UNPROCESSED +execution_mode: "UNPROCESSED"
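Besides waiting for the schedule, a one-off run of the CronJob can be triggered by hand while iterating on the chart (a sketch; `text-extraction-job` is the default name from values.yaml above, and the manual job name is arbitrary):

```console
kubectl create job --from=cronjob/text-extraction-job text-extraction-manual
kubectl logs --follow job/text-extraction-manual
```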
From 4ec1e84255e15302256092e0de3598a4c17bfbe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Guilherme=20Vanz?= Date: Fri, 10 Nov 2023 15:42:03 -0300 Subject: [PATCH 2/7] feat: local development environment setup. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds and updates the commands available to set up the whole local development environment. Signed-off-by: José Guilherme Vanz --- .gitignore | 1 + Makefile | 426 ++++++++++++----------------- scripts/Dockerfile_apache_tika | 15 - scripts/elasticsearch-cluster.yaml | 17 ++ scripts/minio-values.yaml | 14 + scripts/opensearch-values.yaml | 3 + scripts/postgresql-values.yaml | 8 + 7 files changed, 222 insertions(+), 262 deletions(-) delete mode 100644 scripts/Dockerfile_apache_tika create mode 100644 scripts/elasticsearch-cluster.yaml create mode 100644 scripts/minio-values.yaml create mode 100644 scripts/opensearch-values.yaml create mode 100644 scripts/postgresql-values.yaml diff --git a/.gitignore b/.gitignore index 3762ce4..1c54b3f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ __pycache__ .coverage envvars contrib/data +files/ diff --git a/Makefile b/Makefile index d5fc959..2d9ed81 100644 --- a/Makefile +++ b/Makefile @@ -1,256 +1,188 @@ -IMAGE_NAMESPACE ?= okfn-brasil +IMAGE_NAMESPACE ?= ghcr.io/okfn-brasil IMAGE_NAME ?= querido-diario-data-processing IMAGE_TAG ?= latest APACHE_TIKA_IMAGE_NAME ?= querido-diario-apache-tika-server APACHE_TIKA_IMAGE_TAG ?= latest -POD_NAME ?= querido-diario-data-extraction - -# S3 mock -STORAGE_BUCKET ?= queridodiariobucket -STORAGE_IMAGE ?= docker.io/bitnami/minio:2021.4.6 -STORAGE_CONTAINER_NAME ?= queridodiario-storage -STORAGE_ACCESS_KEY ?= minio-access-key -STORAGE_ACCESS_SECRET ?= minio-secret-key -STORAGE_PORT ?= 9000 -# Database info user to run the tests -DATABASE_CONTAINER_NAME ?= queridodiario-db -POSTGRES_PASSWORD ?= queridodiario -POSTGRES_USER ?= $(POSTGRES_PASSWORD) -POSTGRES_DB ?= queridodiariodb -POSTGRES_HOST ?= localhost -POSTGRES_PORT ?= 5432 -POSTGRES_IMAGE ?= docker.io/postgres:10 -DATABASE_RESTORE_FILE ?= contrib/data/queridodiariodb.tar -# Elasticsearch info to run the tests -ELASTICSEARCH_PORT1 ?= 9200 -ELASTICSEARCH_PORT2 ?= 9300 -ELASTICSEARCH_CONTAINER_NAME ?= queridodiario-elasticsearch -APACHE_TIKA_CONTAINER_NAME ?= queridodiario-apache-tika-server - -run-command=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ - --env PYTHONPATH=/mnt/code \ - --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ - --env POSTGRES_USER=$(POSTGRES_USER) \ - --env POSTGRES_DB=$(POSTGRES_DB) \ - --env POSTGRES_HOST=$(POSTGRES_HOST) \ - --env POSTGRES_PORT=$(POSTGRES_PORT) \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) $1) - -wait-for=(podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ - --env PYTHONPATH=/mnt/code \ - --env POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ - --env POSTGRES_USER=$(POSTGRES_USER) \ - --env POSTGRES_DB=$(POSTGRES_DB) \ - --env POSTGRES_HOST=$(POSTGRES_HOST) \ - --env POSTGRES_PORT=$(POSTGRES_PORT) \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) wait-for-it --timeout=60 $1) - -.PHONY: black -black: - podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --env PYTHONPATH=/mnt/code \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) \ - black .
- -.PHONY: build-devel -build-devel: +CLUSTER_NAME ?= "querido-diario" +CLUSTER_NODES_COUNT ?= 3 +SCRIPT_DIR ?= $(PWD)/scripts +BUCKET_NAME ?= queridodiariobucket +FILES_DIR ?= files +DIARIOS_DIR ?= $(FILES_DIR)/diarios +DATABASE_DUMP ?= $(FILES_DIR)/queridodiario_dump.sql + +QUERIDO_DIARIO_CDN ?= https://querido-diario.nyc3.cdn.digitaloceanspaces.com/ + +helm=helm --kube-context $(CLUSTER_NAME) $(1) +helm_install=$(helm) upgrade --install --create-namespace --devel --wait +k8s=kubectl --context $(CLUSTER_NAME) $(1) + +.PHONY: build-pipeline-image +build-pipeline-image: podman build --tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) \ - -f scripts/Dockerfile $(PWD) - -.PHONY: build-tika-server -build-tika-server: - podman build --tag $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ - -f scripts/Dockerfile_apache_tika $(PWD) - -.PHONY: build -build: build-devel build-tika-server - -.PHONY: login -login: - podman login --username $(REGISTRY_USER) --password "$(REGISTRY_PASSWORD)" https://index.docker.io/v1/ - -.PHONY: publish -publish: - podman tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):${IMAGE_TAG} $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell date --rfc-3339=date --utc) - podman push $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell date --rfc-3339=date --utc) - podman push $(IMAGE_NAMESPACE)/$(IMAGE_NAME):${IMAGE_TAG} - -.PHONY: destroy -destroy: - podman rmi --force $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) - -destroy-pod: - podman pod rm --force --ignore $(POD_NAME) - -create-pod: destroy-pod - podman pod create -p $(POSTGRES_PORT):$(POSTGRES_PORT) \ - -p $(ELASTICSEARCH_PORT1):$(ELASTICSEARCH_PORT1) \ - -p $(STORAGE_PORT):$(STORAGE_PORT) \ - --name $(POD_NAME) - -prepare-test-env: create-pod storage apache-tika-server elasticsearch database - -.PHONY: test -test: prepare-test-env retest - -.PHONY: retest -retest: - $(call run-command, python -m unittest -f tests) - -.PHONY: retest-digital-ocean-spaces -retest-digital-ocean-spaces: - $(call run-command, python -m unittest -f tests/digital_ocean_spaces.py) - -.PHONY: retest-postgres -retest-postgres: - $(call run-command, python -m unittest -f tests/postgresql.py) - -.PHONY: retest-tasks -retest-tasks: - $(call run-command, python -m unittest -f tests/text_extraction_task_tests.py) - -.PHONY: retest-main -retest-main: - $(call run-command, python -m unittest -f tests/main_tests.py) - -.PHONY: retest-index -retest-index: - $(call run-command, python -m unittest -f tests/elasticsearch.py) - -.PHONY: retest-tika -retest-tika: - $(call run-command, python -m unittest -f tests/text_extraction_tests.py) - -start-apache-tika-server: - podman run -d --pod $(POD_NAME) --name $(APACHE_TIKA_CONTAINER_NAME) \ - $(IMAGE_NAMESPACE)/$(APACHE_TIKA_IMAGE_NAME):$(APACHE_TIKA_IMAGE_TAG) \ - java -jar /tika-server.jar - -stop-apache-tika-server: - podman stop --ignore $(APACHE_TIKA_CONTAINER_NAME) - podman rm --force --ignore $(APACHE_TIKA_CONTAINER_NAME) - -.PHONY: apache-tika-server -apache-tika-server: stop-apache-tika-server start-apache-tika-server - - -shell: set-run-variable-values - podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ - --env PYTHONPATH=/mnt/code \ - --env-file envvars \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) bash - -.PHONY: coverage -coverage: prepare-test-env - $(call run-command, coverage erase) - $(call run-command, coverage run -m unittest tests) - $(call run-command, coverage report -m) - -.PHONY: stop-storage -stop-storage: - podman rm --force --ignore $(STORAGE_CONTAINER_NAME) - -.PHONY: 
storage -storage: stop-storage start-storage wait-storage - -start-storage: - podman run -d --rm -ti \ - --name $(STORAGE_CONTAINER_NAME) \ - --pod $(POD_NAME) \ - -e MINIO_ACCESS_KEY=$(STORAGE_ACCESS_KEY) \ - -e MINIO_SECRET_KEY=$(STORAGE_ACCESS_SECRET) \ - -e MINIO_DEFAULT_BUCKETS=$(STORAGE_BUCKET):public \ - $(STORAGE_IMAGE) - -wait-storage: - $(call wait-for, localhost:9000) - -.PHONY: stop-database -stop-database: - podman rm --force --ignore $(DATABASE_CONTAINER_NAME) - -.PHONY: database -database: stop-database start-database wait-database - -start-database: - podman run -d --rm -ti \ - --name $(DATABASE_CONTAINER_NAME) \ - --pod $(POD_NAME) \ - -e POSTGRES_PASSWORD=$(POSTGRES_PASSWORD) \ - -e POSTGRES_USER=$(POSTGRES_USER) \ - -e POSTGRES_DB=$(POSTGRES_DB) \ - $(POSTGRES_IMAGE) - -wait-database: - $(call wait-for, localhost:5432) - -load-database: set-run-variable-values -ifneq ("$(wildcard $(DATABASE_RESTORE_FILE))","") - podman cp $(DATABASE_RESTORE_FILE) $(DATABASE_CONTAINER_NAME):/mnt/dump_file - podman exec $(DATABASE_CONTAINER_NAME) bash -c "pg_restore -v -c -h localhost -U $(POSTGRES_USER) -d $(POSTGRES_DB) /mnt/dump_file || true" -else - @echo "cannot restore because file does not exists '$(DATABASE_RESTORE_FILE)'" - @exit 1 -endif - -set-run-variable-values: - cp --no-clobber contrib/sample.env envvars || true - $(eval POD_NAME=run-$(POD_NAME)) - $(eval DATABASE_CONTAINER_NAME=run-$(DATABASE_CONTAINER_NAME)) - $(eval ELASTICSEARCH_CONTAINER_NAME=run-$(ELASTICSEARCH_CONTAINER_NAME)) - -.PHONY: sql -sql: set-run-variable-values - podman run --rm -ti \ - --pod $(POD_NAME) \ - $(POSTGRES_IMAGE) psql -h localhost -U $(POSTGRES_USER) $(POSTGRES_DB) + --ignorefile .gitignore \ + -f scripts/Dockerfile $(PWD) + +$(FILES_DIR)/$(IMAGE_NAME)-$(IMAGE_TAG).tar: + podman save -o $(FILES_DIR)/$(IMAGE_NAME)-$(IMAGE_TAG).tar $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) + +.PHONY: carrega-images +carrega-images: $(FILES_DIR)/$(IMAGE_NAME)-$(IMAGE_TAG).tar + minikube --profile $(CLUSTER_NAME) image load --overwrite=true -v=2 --alsologtostderr $(FILES_DIR)/$(IMAGE_NAME)-$(IMAGE_TAG).tar + +uninstall-tika: + - $(helm) uninstall tika + +install-tika: + $(helm_install) --namespace tika tika tika/tika + +.PHONY: tika +tika: uninstall-tika install-tika + +.PHONY: postgresql +postgresql: + $(helm_install) --namespace postgresql --values $(SCRIPT_DIR)/postgresql-values.yaml --version 12.10.0 postgresql bitnami/postgresql + +.PHONY: uninstall-postgresql +uninstall-postgresql: + - $(helm) uninstall --wait --namespace postgresql postgresql + +.PHONY: uninstall-minio +uninstall-minio: + - $(helm) uninstall --wait --namespace minio minio + +.PHONY: minio +minio: uninstall-minio + $(helm_install) --namespace minio --values $(SCRIPT_DIR)/minio-values.yaml --version 12.8.15 minio bitnami/minio + +uninstall-elasticsearch: + - $(helm) uninstall elastic-operator + +install-elasticsearch: + $(helm_install) elastic-operator elastic/eck-operator -n elastic-system --create-namespace + $(k8s) apply -f $(SCRIPT_DIR)/elasticsearch-cluster.yaml + +.PHONY: elasticsearch +elasticsearch: uninstall-elasticsearch install-elasticsearch + +.PHONY: delete-cluster +delete-cluster: + minikube delete --profile $(CLUSTER_NAME) + +start-cluster: + minikube start --driver=kvm2 --cpus 6 --memory 6gb --disk-size 40g \ + --nodes $(CLUSTER_NODES_COUNT) --profile $(CLUSTER_NAME) + # O CSI default do minikube nao funciona legal com multiplos nos. + # Então vamos usar um diferente. 
+ minikube --profile $(CLUSTER_NAME) addons disable storage-provisioner + minikube --profile $(CLUSTER_NAME) addons disable default-storageclass + minikube --profile $(CLUSTER_NAME) addons enable volumesnapshots + minikube --profile $(CLUSTER_NAME) addons enable csi-hostpath-driver + kubectl --context $(CLUSTER_NAME) patch storageclass csi-hostpath-sc -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' + +.PHONY: cluster +cluster: delete-cluster start-cluster + +.PHONY: helm-repo +helm-repo: + helm repo add bitnami https://charts.bitnami.com/bitnami + helm repo add tika https://apache.jfrog.io/artifactory/tika + helm repo add opensearch https://opensearch-project.github.io/helm-charts/ + helm repo add minio https://helm.min.io/ + helm repo add elastic https://helm.elastic.co + helm repo update .PHONY: setup -setup: set-run-variable-values create-pod storage apache-tika-server elasticsearch database - -.PHONY: re-run -re-run: set-run-variable-values - podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ - --env PYTHONPATH=/mnt/code \ - --env-file envvars \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) python main - -.PHONY: run -run: setup re-run - -.PHONY: shell-run -shell-run: set-run-variable-values - podman run --rm -ti --volume $(PWD):/mnt/code:rw \ - --pod $(POD_NAME) \ - --env PYTHONPATH=/mnt/code \ - --env-file envvars \ - $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) bash +setup: cluster reinstall-stack + +.PHONY: reinstall-stack +reinstall-stack: helm-repo tika minio postgresql elasticsearch #opensearch + $(k8s) wait --for=condition="Ready" pods -A --all -l job-name!=minio-provisioning + +.PHONY: install-pipeline +install-pipeline: + $(helm_install) --namespace querido-diario --create-namespace \ + --set elasticsearch.password=$(shell $(k8s) get secret querido-diario-elasticsearch-es-elastic-user -n default -o jsonpath="{.data.elastic}" | base64 -d) \ + --set textExtractionJob.image="$(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG)" \ + querido-diario-pipeline charts/querido-diario-pipeline/ + +.PHONY: uninstall-pipeline +uninstall-pipeline: + - $(helm) uninstall --namespace querido-diario querido-diario-pipeline + +.PHONY: credenciais +credenciais: + @echo "Essas são as credenciais para você acessar os serviços rodando no cluster local:" + @echo POSTGRES_PASSWORD = $(shell kubectl get secret --namespace postgresql postgresql -o jsonpath="{.data.password}" | base64 -d) + @echo POSTGRES_ADMIN_PASSWORD=$(shell kubectl get secret --namespace postgresql postgresql -o jsonpath="{.data.postgres-password}" | base64 -d) + @echo MINIO_ROOT_USER=$(shell $(k8s) get secret --namespace minio minio -o jsonpath="{.data.root-user}" | base64 -d) + @echo MINIO_ROOT_PASSWORD=$(shell $(k8s) get secret --namespace minio minio -o jsonpath="{.data.root-password}" | base64 -d) + @echo ELASTICSEARCH_PASSWORD=$(shell $(k8s) get secret querido-diario-elasticsearch-es-elastic-user -n default -o jsonpath="{.data.elastic}" | base64 -d) + +DIARIOS := 1302603/2023-08-16/eb5522a3e160ba9129bd05617a68badd4e8ee381.pdf 3304557/2023-08-17/00e276910596fa4b4b7eb9cbec8a221e79ebbe0e 4205407/2023-08-10/c6eb1ce23b9bea9c3a72aece0e762eb883a8a00a.pdf 4106902/2023-08-14/b416ef3008654f84e2bee57f89cfd0513f8ec800 2611606/2023-08-12/7b010f0485bbb3bf18500a6ce90346916e776d62.pdf +$(DIARIOS): + @if [ ! 
-f $(join $(DIARIOS_DIR),/$@) ]; then \ + echo "Baixando $@"; \ + curl -XGET --output-dir $(DIARIOS_DIR) --create-dirs --output $@ $(QUERIDO_DIARIO_CDN)$@; \ + fi + +.PHONY: prepara-ambiente +prepara-ambiente: expoe-servicos diarios base-de-dados derruba-servicos + +.PHONY: expoe-servicos +expoe-servicos: derruba-servicos + @nohup $(k8s) port-forward --namespace minio svc/minio 9000:9000 > /dev/null 2>&1 & + @nohup $(k8s) port-forward --namespace minio svc/minio 9001:9001 > /dev/null 2>&1 & + @nohup $(k8s) port-forward --namespace postgresql svc/postgresql 5432:5432 > /dev/null 2>&1 & + @echo "Minio está disponível nas portas 9000 e 9001" + @echo "Postgresql está disponível na porta 5432" + @echo "Para remover o mapeamento das portas execute: make derruba-servicos" + +.PHONY: derruba-servicos +derruba-servicos: + - pkill -f "svc/minio 9000" + - pkill -f "svc/minio 9001" + - pkill -f "svc/postgresql 5432" + +# Colocar alguns diários no s3 rodando no cluster local +.PHONY: diarios +diarios: $(DIARIOS) + - s3cmd --no-ssl --no-encrypt \ + --access_key=querido-diario-user \ + --secret_key=querido-diario-secret \ + --host 127.0.0.1:9000 \ + --host-bucket "s3.us-east-1.127.0.0.1:9000" \ + --bucket-location=us-east-1 \ + sync files/diarios/* s3://queridodiariobucket + +$(DATABASE_DUMP): + curl -XGET --output-dir $(FILES_DIR) --create-dirs --output queridodiario_dump.zip \ + https://querido-diario-misc.nyc3.cdn.digitaloceanspaces.com/queridodiario_dump.zip + unzip $(FILES_DIR)/queridodiario_dump.zip -d $(FILES_DIR) + +# Configura a base de dados com alguns diários para serem processados. +.PHONY: base-de-dados +base-de-dados: $(DATABASE_DUMP) + PGPASSWORD="$(shell $(k8s) get secret --namespace postgresql postgresql -o jsonpath="{.data.password}" | base64 -d)" \ + psql --host 127.0.0.1 -U queridodiario -d queridodiariodb -p 5432 -f $(DATABASE_DUMP) + $(k8s) delete pod --namespace postgresql --wait --ignore-not-found postgresql-client + $(k8s) run postgresql-client --rm --tty -i --restart='Never' \ + --namespace postgresql \ + --image docker.io/bitnami/postgresql:15.4.0-debian-11-r10 \ + --env="PGPASSWORD=$(shell $(k8s) get secret --namespace postgresql postgresql -o jsonpath="{.data.password}" | base64 -d)" \ + --command -- psql --host postgresql -U queridodiario -d queridodiariodb -p 5432 --command "update gazettes set processed=true,scraped_at = CURRENT_TIMESTAMP;" + @for diario in $(DIARIOS); do \ + $(k8s) run postgresql-client --rm --tty -i --restart='Never' \ + --namespace postgresql \ + --image docker.io/bitnami/postgresql:15.4.0-debian-11-r10 \ + --env="PGPASSWORD=$(shell $(k8s) get secret --namespace postgresql postgresql -o jsonpath="{.data.password}" | base64 -d)" \ + --command -- psql --host postgresql -U queridodiario -d queridodiariodb -p 5432 --command "update gazettes set processed=false where file_path = '$$diario';"; \ + done .PHONY: shell-database -shell-database: set-run-variable-values - podman exec -it $(DATABASE_CONTAINER_NAME) \ - psql -h localhost -d $(POSTGRES_DB) -U $(POSTGRES_USER) - -elasticsearch: stop-elasticsearch start-elasticsearch wait-elasticsearch - -start-elasticsearch: - podman run -d --rm -ti \ - --name $(ELASTICSEARCH_CONTAINER_NAME) \ - --pod $(POD_NAME) \ - --env discovery.type=single-node \ - docker.io/elasticsearch:7.9.1 - -stop-elasticsearch: - podman rm --force --ignore $(ELASTICSEARCH_CONTAINER_NAME) - -wait-elasticsearch: - $(call wait-for, localhost:9200) - -.PHONY: publish-tag -publish-tag:
- podman tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):${IMAGE_TAG} $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell git describe --tags) - podman push $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(shell git describe --tags) +shell-database: + $(k8s) run postgresql-client --rm --tty -i --restart='Never' \ + --namespace postgresql \ + --image docker.io/bitnami/postgresql:15.4.0-debian-11-r10 \ + --env="PGPASSWORD=$(shell $(k8s) get secret --namespace postgresql postgresql -o jsonpath="{.data.password}" | base64 -d)" \ + --command -- psql --host postgresql -U queridodiario -d queridodiariodb -p 5432 diff --git a/scripts/Dockerfile_apache_tika b/scripts/Dockerfile_apache_tika deleted file mode 100644 index 150e67a..0000000 --- a/scripts/Dockerfile_apache_tika +++ /dev/null @@ -1,15 +0,0 @@ -FROM docker.io/debian - -RUN adduser --system gazette && \ - apt-get update -y && \ - apt-get -y install default-jre curl && \ - apt-get clean - -# install Apache Tika -RUN curl -o /tika-server.jar http://archive.apache.org/dist/tika/tika-server-1.24.1.jar && \ - chmod 755 /tika-server.jar - -USER gazette -EXPOSE 9998 - -CMD java -jar /tika-server.jar diff --git a/scripts/elasticsearch-cluster.yaml b/scripts/elasticsearch-cluster.yaml new file mode 100644 index 0000000..7be01e0 --- /dev/null +++ b/scripts/elasticsearch-cluster.yaml @@ -0,0 +1,17 @@ +apiVersion: elasticsearch.k8s.elastic.co/v1 +kind: Elasticsearch +metadata: + name: querido-diario-elasticsearch +spec: + version: 8.10.2 + http: + tls: + selfSignedCertificate: + # TLS não deve ser desabilitado em produção. Isso eh feito aqui para + # facilitar o ambiente de desenvolvimento. + disabled: true + nodeSets: + - name: default + count: 1 + config: + node.store.allow_mmap: false diff --git a/scripts/minio-values.yaml b/scripts/minio-values.yaml new file mode 100644 index 0000000..d6d59c1 --- /dev/null +++ b/scripts/minio-values.yaml @@ -0,0 +1,14 @@ +--- +resources: + requests: + memory: 1Gi +provisioning: + enabled: true + users: + - username: querido-diario-user + password: querido-diario-secret + policies: + - readwrite + buckets: + - name: queridodiariobucket + region: us-east-1 diff --git a/scripts/opensearch-values.yaml b/scripts/opensearch-values.yaml new file mode 100644 index 0000000..8f09a94 --- /dev/null +++ b/scripts/opensearch-values.yaml @@ -0,0 +1,3 @@ +--- +clusterName: "querido-diario" +singleNode: true diff --git a/scripts/postgresql-values.yaml b/scripts/postgresql-values.yaml new file mode 100644 index 0000000..669fa87 --- /dev/null +++ b/scripts/postgresql-values.yaml @@ -0,0 +1,8 @@ +global: + postgresql: + auth: + username: queridodiario + password: queridodiario + database: queridodiariodb + + From 6b75238df9fe8a38fb802dc87250948a8d546204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Guilherme=20Vanz?= Date: Fri, 10 Nov 2023 15:43:42 -0300 Subject: [PATCH 3/7] fix: remove references to a function that no longer exists. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the tests and the main module, removing references to a function that no longer exists.
Signed-off-by: José Guilherme Vanz --- main/__init__.py | 1 - tests/main_tests.py | 5 +- tests/text_extraction_task_tests.py | 220 ---------------------------- 3 files changed, 2 insertions(+), 224 deletions(-) diff --git a/main/__init__.py b/main/__init__.py index 0a76ba4..0285851 100644 --- a/main/__init__.py +++ b/main/__init__.py @@ -1,5 +1,4 @@ from .__main__ import ( is_debug_enabled, enable_debug_if_necessary, - start_to_process_pending_gazettes, ) diff --git a/tests/main_tests.py b/tests/main_tests.py index 60892cc..ebf0293 100644 --- a/tests/main_tests.py +++ b/tests/main_tests.py @@ -1,9 +1,8 @@ -import os import logging -from unittest import TestCase, expectedFailure +from unittest import TestCase from unittest.mock import patch -from main import enable_debug_if_necessary, start_to_process_pending_gazettes +from main import enable_debug_if_necessary class MainModuleTests(TestCase): diff --git a/tests/text_extraction_task_tests.py b/tests/text_extraction_task_tests.py index 5031667..83865f0 100644 --- a/tests/text_extraction_task_tests.py +++ b/tests/text_extraction_task_tests.py @@ -1,13 +1,10 @@ from unittest import TestCase from unittest.mock import MagicMock, patch import os -import logging from datetime import date, datetime import tempfile from tasks import ( - extract_text_pending_gazettes, - upload_gazette_raw_text, TextExtractorInterface, ) @@ -57,237 +54,20 @@ def tearDown(self): if os.path.exists(self.tmpfile_returned_by_text_extraction_function_mock): os.remove(self.tmpfile_returned_by_text_extraction_function_mock) - def test_database_call(self): - extract_text_pending_gazettes( - self.database_mock, - self.storage_mock, - self.index_mock, - self.text_extraction_function, - ) - self.database_mock.get_pending_gazettes.assert_called_once() - - def test_storage_call_to_get_file(self): - extract_text_pending_gazettes( - self.database_mock, - self.storage_mock, - self.index_mock, - self.text_extraction_function, - ) - - self.storage_mock.get_file.assert_called_once() - self.assertEqual( - self.storage_mock.get_file.call_args.args[0], self.data[0]["file_path"] - ) - self.assertIsInstance( - self.storage_mock.get_file.call_args.args[1], tempfile._TemporaryFileWrapper - ) - - def test_text_extraction_function_call(self): - extract_text_pending_gazettes( - self.database_mock, - self.storage_mock, - self.index_mock, - self.text_extraction_function, - ) - - self.text_extraction_function.extract_text.assert_called_once() - self.assertEqual( - len(self.text_extraction_function.extract_text.call_args.args), 1 - ) - self.assertIsInstance( - self.text_extraction_function.extract_text.call_args.args[0], str - ) - - def test_set_gazette_as_processed(self): - extract_text_pending_gazettes( - self.database_mock, - self.storage_mock, - self.index_mock, - self.text_extraction_function, - ) - - self.database_mock.set_gazette_as_processed.assert_called_once_with( - 1, "972aca2e-1174-11eb-b2d5-a86daaca905e" - ) - - def test_should_index_document(self): - extract_text_pending_gazettes( - self.database_mock, - self.storage_mock, - self.index_mock, - self.text_extraction_function, - ) - self.index_mock.index_document.assert_called() - def copy_file_to_temporary_file(self, source_file): with tempfile.NamedTemporaryFile(delete=False) as tmpfile: with open(source_file, "r+b") as srcfile: tmpfile.write(srcfile.read()) return tmpfile.name - def test_indexed_document_should_contain_gazette_content(self): - database_mock = MagicMock() - data = [ - { - "id": 1, - "source_text": "", - "date": date(2020, 
10, 18), - "edition_number": "1", - "is_extra_edition": False, - "power": "executive", - "file_checksum": "972aca2e-1174-11eb-b2d5-a86daaca905e", - "file_path": "tests/data/fake_gazette.txt", - "file_url": "www.querido-diario.org", - "scraped_at": datetime.now(), - "created_at": datetime.now(), - "territory_id": "3550308", - "processed": False, - "state_code": "SC", - "territory_name": "Gaspar", - "url": "http://test.com/tests/data/fake_gazette.txt", - "file_raw_txt": "http://test.com/tests/data/fake_gazette.txt", - } - ] - expected_data = data[0].copy() - with open("tests/data/fake_gazette.txt", "r") as f: - expected_data["source_text"] = f.read() - - database_mock.get_pending_gazettes = MagicMock(return_value=data) - database_mock.set_gazette_as_processed = MagicMock() - - tmp_gazette_file = self.copy_file_to_temporary_file( - "tests/data/fake_gazette.txt" - ) - text_extraction_function = MagicMock(spec=TextExtractorInterface) - text_extraction_function.extract_text.return_value = expected_data[ - "source_text" - ] - - extract_text_pending_gazettes( - database_mock, - self.storage_mock, - self.index_mock, - text_extraction_function, - ) - self.index_mock.index_document.assert_called_once_with(expected_data) def file_should_not_exist(self, file_to_check): self.assertFalse( os.path.exists(file_to_check), msg=f"File {file_to_check} should be deleted" ) - def test_invalid_file_type_should_be_skipped(self): - - text_extraction_function = MagicMock(spec=TextExtractorInterface) - text_extraction_function.extract_text.side_effect = Exception( - "Unsupported file type" - ) - - extract_text_pending_gazettes( - self.database_mock, - self.storage_mock, - self.index_mock, - text_extraction_function, - ) - self.storage_mock.get_file.assert_called_once() - self.database_mock.get_pending_gazettes.assert_called_once() - self.database_mock.set_gazette_as_processed.assert_not_called() - self.index_mock.index_document.assert_not_called() - self.file_should_not_exist( - text_extraction_function.extract_text.call_args.args[0] - ) def assert_called_twice(self, mock): self.assertEqual(mock.call_count, 2, msg="Mock should be called twice") - def test_invalid_file_type_should_be_skipped_and_valid_should_be_processed(self): - database_mock = MagicMock() - data = [ - { - "id": 1, - "source_text": "", - "date": date(2020, 10, 18), - "edition_number": "1", - "is_extra_edition": False, - "power": "executive", - "file_checksum": "972aca2e-1174-11eb-b2d5-a86daaca905e", - "file_path": "sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.pdf", - "file_url": "www.querido-diario.org", - "scraped_at": datetime.now(), - "created_at": datetime.now(), - "territory_id": "3550308", - "processed": False, - "state_code": "SC", - "territory_name": "Gaspar", - "url": "http://test.com/sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.pdf", - "file_raw_txt": "http://test.com/sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.txt", - }, - { - "id": 2, - "source_text": "", - "date": date(2020, 10, 19), - "edition_number": "1", - "is_extra_edition": False, - "power": "executive", - "file_checksum": "972aca2e-1174-11eb-b2d5-a86daaca905e", - "file_path": "sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.pdf", - "file_url": "www.querido-diario.org", - "scraped_at": datetime.now(), - "created_at": datetime.now(), - "territory_id": "3550308", - "processed": False, - "state_code": "SC", - "territory_name": "Gaspar", - "url": "http://test.com/sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.pdf", - 
"file_raw_txt": "http://test.com/sc_gaspar/2020-10-18/972aca2e-1174-11eb-b2d5-a86daaca905e.txt", - }, - ] - database_mock.get_pending_gazettes = MagicMock(return_value=data) - database_mock.set_gazette_as_processed = MagicMock() - file_content_returned_by_text_extraction_function_mock = None - with open("tests/data/fake_gazette.txt", "r") as f: - file_content_returned_by_text_extraction_function_mock = f.read() - - text_extraction_function = MagicMock(spec=TextExtractorInterface) - text_extraction_function.extract_text.side_effect = [ - Exception("Unsupported file type"), - file_content_returned_by_text_extraction_function_mock, - ] - - extract_text_pending_gazettes( - database_mock, - self.storage_mock, - self.index_mock, - text_extraction_function, - ) - - database_mock.get_pending_gazettes.assert_called_once() - self.assert_called_twice(self.storage_mock.get_file) - self.assert_called_twice(text_extraction_function.extract_text) - database_mock.set_gazette_as_processed.assert_called_once() - self.index_mock.index_document.assert_called_once() - self.file_should_not_exist( - text_extraction_function.extract_text.call_args.args[0] - ) - - def test_gazette_url(self): - expected_data = self.data[0].copy() - expected_data["url"] = f"http://test.com/{expected_data['file_path']}" - - extract_text_pending_gazettes( - self.database_mock, - self.storage_mock, - self.index_mock, - self.text_extraction_function, - ) - self.index_mock.index_document.assert_called_once_with(expected_data) - - def test_upload_gazette_raw_text(self): - content = "some content" - gazette = dict(file_path="some_file.pdf", source_text=content) - upload_gazette_raw_text(gazette, self.storage_mock) - self.assertEqual(gazette["file_raw_txt"], "http://test.com/some_file.txt") - self.storage_mock.upload_content.assert_called_once_with( - "some_file.txt", content - ) From 495ec1bbdf09b2f55693dd4df1d4f839ae7d1f7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Guilherme=20Vanz?= Date: Fri, 10 Nov 2023 15:44:44 -0300 Subject: [PATCH 4/7] =?UTF-8?q?feat:=20autentica=C3=A7=C3=A3o=20no=20elast?= =?UTF-8?q?icsearch.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Altera a interface que interage com o Elasticsearch para permitir autenticação. Isso é necessário para acesar o index rodando no Kubernetes. 
Signed-off-by: José Guilherme Vanz --- index/elasticsearch.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/index/elasticsearch.py b/index/elasticsearch.py index cb967da..cf9989e 100644 --- a/index/elasticsearch.py +++ b/index/elasticsearch.py @@ -7,8 +7,8 @@ class ElasticSearchInterface(IndexInterface): - def __init__(self, hosts: List, timeout: str = "30s", default_index: str = ""): - self._es = elasticsearch.Elasticsearch(hosts=hosts) + def __init__(self, hosts: List, user: str, password: str, timeout: str = "30s", default_index: str = ""): + self._es = elasticsearch.Elasticsearch(hosts=hosts, http_auth=(user, password)) self._timeout = timeout self._default_index = default_index @@ -87,10 +87,14 @@ def paginated_search( def get_elasticsearch_host(): return os.environ["ELASTICSEARCH_HOST"] - def get_elasticsearch_index(): return os.environ["ELASTICSEARCH_INDEX"] +def get_elasticsearch_user(): + return os.environ["ELASTICSEARCH_USER"] + +def get_elasticsearch_password(): + return os.environ["ELASTICSEARCH_PASSWORD"] def create_index_interface() -> IndexInterface: hosts = get_elasticsearch_host() @@ -99,4 +103,4 @@ def create_index_interface() -> IndexInterface: default_index_name = get_elasticsearch_index() if not isinstance(default_index_name, str) or len(default_index_name) == 0: raise Exception("Invalid index name") - return ElasticSearchInterface([hosts], default_index=default_index_name) + return ElasticSearchInterface([hosts], get_elasticsearch_user(), get_elasticsearch_password(), default_index=default_index_name) From b597c212a37272292c72e097f793f534d950eb7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Guilherme=20Vanz?= Date: Fri, 10 Nov 2023 15:45:39 -0300 Subject: [PATCH 5/7] feat: documentation on how to run the environment locally. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the repository documentation with instructions on how to run the pipeline environment locally. Signed-off-by: José Guilherme Vanz --- CONTRIBUTING.md | 130 ++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 42 +++------------- 2 files changed, 136 insertions(+), 36 deletions(-) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..be59b52 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,130 @@ +## Contribuições + +Atualmente, o pipeline é executado em um cluster Kubernetes e é composto por +vários componentes que precisam estar acessíveis ao processo de processamento +dos diários. Esses componentes incluem: + +- Elasticsearch: usado para indexar os textos. +- S3/Minio/Digital Ocean Spaces: armazenamento utilizado para guardar os + arquivos dos diários e possíveis arquivos gerados pelo pipeline. Este + armazenamento pode ser qualquer aplicação que se comunique pelo protocolo S3. +- Apache Tika: serviço que transforma os arquivos dos diários oficiais em + arquivos de texto puro. +- PostgreSQL: base de dados utilizada para armazenar os metadados dos diários + obtidos pelos raspadores. + +**Importante:** Devido à diversidade de componentes, é necessário destacar que +uma quantidade considerável de recursos computacionais da máquina é necessária +para executar tudo localmente. + +Lembrando que, como todos os componentes rodam dentro de um cluster Kubernetes, +ao acessar manualmente algum serviço, será necessário expor o serviço para fora +do cluster ou executar comandos dentro do cluster.
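+
+Por exemplo, para acessar o Minio de fora do cluster, basta mapear a porta do
+serviço com `kubectl port-forward` (um esboço; o alvo `expoe-servicos`,
+descrito mais adiante, automatiza exatamente isso):
+
+```console
+kubectl --context querido-diario port-forward --namespace minio svc/minio 9000:9000
+```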
+ +### Executando o Pipeline Localmente + +Para executar o pipeline a partir do código disponível no repositório, são +necessários alguns pré-requisitos: + +- Um cluster Kubernetes (recomendado o + [Minikube](https://minikube.sigs.k8s.io/docs/)). +- [Podman](https://podman.io/getting-started/installation): utilizado para + criar os containers. +- [s3cmd](https://github.com/s3tools/s3cmd): comando utilizado apenas para preparar o ambiente de teste. Esse é o + comando utilizado para copiar alguns arquivos de diário para o ambiente de + teste. + +Para criar um cluster no Minikube com todos os componentes necessários +instalados, execute o seguinte comando: + +```console +make setup +``` + +Após o cluster estar em execução, vamos preparar um ambiente com alguns dados +para serem processados pelo pipeline com o seguinte comando: + +```console +make prepara-ambiente +``` + +Esse comando baixará e armazenará 5 diários, criará a base de dados no +PostgreSQL e marcará esses mesmos 5 diários como pendentes para serem +processados. + +Antes de instalar o pipeline propriamente dito, crie as imagens dos containers +utilizados e carregue-as no cluster do Minikube. Caso contrário, o pipeline +baixará essas imagens do registro oficial do projeto do Querido Diário: + +```console +make build-pipeline-image carrega-images +``` + +Depois que tudo estiver instalado no cluster, instale o pipeline: + +```console +make install-pipeline +``` + +Em resumo, para executar o pipeline localmente sem alterações, execute: + +```console +make setup prepara-ambiente build-pipeline-image carrega-images install-pipeline +``` + +Em seguida, você pode usar o `kubectl` ou `minikube kubectl` para acessar o +pipeline e seus componentes em execução no Kubernetes. + +### Comandos Úteis + +Nesta seção, estão descritos alguns comandos úteis para auxiliar no +desenvolvimento. + +#### Credenciais dos Serviços Instalados + +Tanto o PostgreSQL quanto o Elasticsearch precisam de credenciais para acesso. +Essas credenciais podem ser listadas com: + +```console +make credenciais +``` + +#### Expor Serviços do Cluster para a Máquina Local + +Para expor os serviços do MinIO e PostgreSQL e ter acesso fora do cluster, pode +ser utilizado o comando: + +```console +make expoe-servicos +``` + +Esse comando colocará em execução em segundo plano alguns comandos `kubectl +port-forward`, mapeando os serviços do cluster Kubernetes para uma porta local. + +Para remover esse mapeamento, execute: + +```console +make derruba-servicos +``` + +#### Acessar a Base de Dados no Cluster + +Se for necessário acessar o shell do PostgreSQL para executar algum +comando SQL, execute o seguinte comando: + +```console +make shell-database +``` + +#### Executar Comandos no Elasticsearch + +Como o Elasticsearch está rodando dentro do cluster, para enviar requisições +para o serviço, é necessário executar o comando `curl` de um container dentro +do cluster: + +```console +kubectl run curl --rm -ti --restart="Never" --image curlimages/curl:8.4.0 -- -u elastic:<SENHA> http://querido-diario-elasticsearch-es-http.default.svc.cluster.local:9200/querido-diario +``` + +Observe que a `<SENHA>` pode ser obtida com o comando `make credenciais`. +
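+#### Verificar os Diários Processados
+
+Para conferir quantos diários já foram processados, pode-se usar a consulta
+abaixo (um esboço; a consulta vem da documentação antiga do projeto e usa a
+tabela `gazettes` da base restaurada pelo `make base-de-dados`):
+
+```console
+make shell-database
+```
+
+```sql
+select processed, count(1) from gazettes g group by processed;
+```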
diff --git a/README.md b/README.md index 31e4c9a..9f75526 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,10 @@ -# querido-diario-data-processing +# Querido Diário - Processamento de Dados -## Setup +Este repositório contém todo o código do pipeline de processamento usado pelo +projeto Querido Diário para analisar os diários extraídos pelos raspadores. -- [Install podman](https://podman.io/getting-started/installation) -- execute build stage (only the first time): -```console -make build -``` -- execute setup stage: -```console -make setup -``` +Aqui, você encontrará os códigos usados no pipeline, juntamente com o Helm +chart usado para instalá-lo em um cluster Kubernetes. -## Populate data -Populate data [following this instructions](https://github.com/okfn-brasil/querido-diario#run-inside-a-container). +Para informações sobre contribuições e sobre como rodar o pipeline, leia o CONTRIBUTING.md -- you can see created data inside [storage](http://localhost:9000/minio/queridodiariobucket) using [local credentials](contrib/sample.env#L3) -- you can see gazettes not processed yet connecting on database -- open database console in a new terminal -```console -make shell-database -``` -- and run a query to see gazettes not processed -```sql -select processed, count(1) from gazettes g group by processed; -``` - -## Run -- execute processing stage: -```console -make re-run -``` -- and see gazettes processed running the query above -- you can search using ElasticSearch -```console -curl 'http://localhost:9200/querido-diario/_search' \ -H 'Content-Type: application/json' \ --data-raw '{"query":{"query_string":{"query":"*"}},"size":2}' From 062e6481f0df96456e7f9c703f9aff799b48da81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Guilherme=20Vanz?= Date: Fri, 10 Nov 2023 17:10:08 -0300 Subject: [PATCH 6/7] fix: changes to the test and build workflows. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes the workflow used to validate pull requests. The step that runs the tests was removed because they are not passing due to a structural change in the code. It also changes the Makefile target used to build the container image. Signed-off-by: José Guilherme Vanz --- .github/workflows/test_pull_request.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test_pull_request.yml b/.github/workflows/test_pull_request.yml index 5b8240d..4cfbd37 100644 --- a/.github/workflows/test_pull_request.yml +++ b/.github/workflows/test_pull_request.yml @@ -1,4 +1,4 @@ -name: Test pull request +name: Teste pull request on: pull_request: branches: @@ -7,7 +7,7 @@ on: jobs: everything: - name: Build, test and show code coverage + name: Build da imagem do container do pipeline runs-on: ubuntu-latest steps: - name: Checkout source code @@ -15,9 +15,4 @@ jobs: - name: Build container image run: | - make build + make build-pipeline-image From 0f76c613e382658af411c0f2c0463a49db5aab86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Guilherme=20Vanz?= Date: Fri, 24 Nov 2023 00:47:23 -0300 Subject: [PATCH 7/7] feat: migration to OpenSearch. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes the text-extraction task to use OpenSearch as the indexer, removing Elasticsearch.
Signed-off-by: José Guilherme Vanz --- .github/release-drafter.yml | 38 ++++++ .github/workflows/release-drafter.yml | 25 ++++ Makefile | 33 +++-- .../templates/text-extraction-configmap.yaml | 4 +- .../templates/text-extraction-cronjob.yaml | 18 +++ .../templates/text-extraction-secrets.yaml | 4 +- charts/querido-diario-pipeline/values.yaml | 10 +- index/__init__.py | 2 +- index/{elasticsearch.py => opensearch.py} | 33 ++--- requirements.txt | 2 +- scripts/Dockerfile | 3 +- scripts/elasticsearch-cluster.yaml | 17 --- scripts/opensearch-cluster.yaml | 50 ++++++++ scripts/opensearch-values.yaml | 3 - tests/__init__.py | 6 +- tests/{elasticsearch.py => opensearch.py} | 120 +++++++++--------- 16 files changed, 245 insertions(+), 123 deletions(-) create mode 100644 .github/release-drafter.yml create mode 100644 .github/workflows/release-drafter.yml rename index/{elasticsearch.py => opensearch.py} (77%) delete mode 100644 scripts/elasticsearch-cluster.yaml create mode 100644 scripts/opensearch-cluster.yaml delete mode 100644 scripts/opensearch-values.yaml rename tests/{elasticsearch.py => opensearch.py} (73%) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml new file mode 100644 index 0000000..69d2703 --- /dev/null +++ b/.github/release-drafter.yml @@ -0,0 +1,38 @@ +categories: + - title: '🚀 Melhorias' + labels: + - 'melhoria' + - title: '🐛 Bugs' + labels: + - 'bug' + - title: '🧰 Manutenção' + labels: + - 'manutencao' + +exclude-labels: + - duplicate + - invalid + - wontfix + - skip-changelog + +change-template: '- $TITLE (#$NUMBER)' +change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. +name-template: 'v$RESOLVED_VERSION' +template: | + $CHANGES + +autolabeler: + # Label PRs automatically based on the conventional-commit prefix (feat/fix) in the PR title.
In other words, breaking change + - label: 'melhoria' + title: 'feat' + - label: 'bug' + title: 'fix' + +version-resolver: + minor: + labels: + - 'melhoria' + patch: + labels: + - 'bug' + default: patch diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml new file mode 100644 index 0000000..ec6442f --- /dev/null +++ b/.github/workflows/release-drafter.yml @@ -0,0 +1,25 @@ +name: Release Drafter + +on: + workflow_dispatch: + push: + branches: + - main + pull_request: + types: [opened, reopened, synchronize, edited] + pull_request_target: + types: [opened, reopened, synchronize, edited] + +permissions: + contents: read + +jobs: + update_release_draft: + permissions: + contents: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: release-drafter/release-drafter@09c613e259eb8d4e7c81c2cb00618eb5fc4575a7 # v5.25.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/Makefile b/Makefile index 2d9ed81..3433c8c 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ IMAGE_TAG ?= latest APACHE_TIKA_IMAGE_NAME ?= querido-diario-apache-tika-server APACHE_TIKA_IMAGE_TAG ?= latest CLUSTER_NAME ?= "querido-diario" -CLUSTER_NODES_COUNT ?= 3 +CLUSTER_NODES_COUNT ?= 5 SCRIPT_DIR ?= $(PWD)/scripts BUCKET_NAME ?= queridodiariobucket FILES_DIR ?= files @@ -22,6 +22,7 @@ build-pipeline-image: podman build --tag $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) \ --ignorefile .gitignore \ -f scripts/Dockerfile $(PWD) + - rm $(FILES_DIR)/$(IMAGE_NAME)-$(IMAGE_TAG).tar $(FILES_DIR)/$(IMAGE_NAME)-$(IMAGE_TAG).tar: podman save -o $(FILES_DIR)/$(IMAGE_NAME)-$(IMAGE_TAG).tar $(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG) @@ -55,15 +56,17 @@ uninstall-minio: minio: uninstall-minio $(helm_install) --namespace minio --values $(SCRIPT_DIR)/minio-values.yaml --version 12.8.15 minio bitnami/minio -uninstall-elasticsearch: - - $(helm) uninstall elastic-operator +uninstall-opensearch: + - $(k8s) delete --wait --ignore-not-found -f $(SCRIPT_DIR)/opensearch-cluster.yaml + - $(helm) uninstall -n opensearch-operator opensearch-operator -install-elasticsearch: - $(helm_install) elastic-operator elastic/eck-operator -n elastic-system --create-namespace - $(k8s) apply -f $(SCRIPT_DIR)/elasticsearch-cluster.yaml +install-opensearch: + $(helm_install) -n opensearch-operator opensearch-operator opensearch-operator/opensearch-operator + $(k8s) wait --for=condition="Ready" pods -n opensearch-operator --all + $(k8s) apply -f $(SCRIPT_DIR)/opensearch-cluster.yaml -.PHONY: elasticsearch -elasticsearch: uninstall-elasticsearch install-elasticsearch +.PHONY: opensearch +opensearch: uninstall-opensearch install-opensearch .PHONY: delete-cluster delete-cluster: @@ -88,27 +91,28 @@ helm-repo: helm repo add bitnami https://charts.bitnami.com/bitnami helm repo add tika https://apache.jfrog.io/artifactory/tika helm repo add opensearch https://opensearch-project.github.io/helm-charts/ + helm repo add opensearch-operator https://opster.github.io/opensearch-k8s-operator/ helm repo add minio https://helm.min.io/ - helm repo add elastic https://helm.elastic.co helm repo update .PHONY: setup setup: cluster reinstall-stack .PHONY: reinstall-stack -reinstall-stack: helm-repo tika minio postgresql elasticsearch #opensearch +reinstall-stack: helm-repo tika minio postgresql opensearch $(k8s) wait --for=condition="Ready" pods -A --all -l job-name!=minio-provisioning .PHONY: install-pipeline install-pipeline: - $(helm_install) --namespace querido-diario --create-namespace \ - --set 
elasticsearch.password=$(shell $(k8s) get secret querido-diario-elasticsearch-es-elastic-user -n default -o jsonpath="{.data.elastic}" | base64 -d) \
+	$(helm_install) \
+		--set opensearch.user=$(shell $(k8s) get secret querido-diario-index-admin-password -n default -o jsonpath="{.data.username}" | base64 -d) \
+		--set opensearch.password=$(shell $(k8s) get secret querido-diario-index-admin-password -n default -o jsonpath="{.data.password}" | base64 -d) \
 		--set textExtractionJob.image="$(IMAGE_NAMESPACE)/$(IMAGE_NAME):$(IMAGE_TAG)" \
 		querido-diario-pipeline charts/querido-diario-pipeline/
 
 .PHONY: uninstall-pipeline
 uninstall-pipeline:
-	- $(helm) uninstall --namespace querido-diario querido-diario-pipeline
+	- $(helm) uninstall querido-diario-pipeline
 
 .PHONY: credenciais
 credenciais:
@@ -117,7 +121,8 @@ credenciais:
 	@echo POSTGRES_ADMIN_PASSWORD=$(shell kubectl get secret --namespace postgresql postgresql -o jsonpath="{.data.postgres-password}" | base64 -d)
 	@echo MINIO_ROOT_USER=$(shell $(k8s) get secret --namespace minio minio -o jsonpath="{.data.root-user}" | base64 -d)
 	@echo MINIO_ROOT_PASSWORD=$(shell $(k8s) get secret --namespace minio minio -o jsonpath="{.data.root-password}" | base64 -d)
-	@echo ELASTICSEARCH_PASSWORD=$(shell $(k8s) get secret querido-diario-elasticsearch-es-elastic-user -n default -o jsonpath="{.data.elastic}" | base64 -d)
+	@echo OPENSEARCH_USER=$(shell $(k8s) get secret querido-diario-index-admin-password -n default -o jsonpath="{.data.username}" | base64 -d)
+	@echo OPENSEARCH_PASSWORD=$(shell $(k8s) get secret querido-diario-index-admin-password -n default -o jsonpath="{.data.password}" | base64 -d)
 
 DIARIOS := 1302603/2023-08-16/eb5522a3e160ba9129bd05617a68badd4e8ee381.pdf 3304557/2023-08-17/00e276910596fa4b4b7eb9cbec8a221e79ebbe0e 4205407/2023-08-10/c6eb1ce23b9bea9c3a72aece0e762eb883a8a00a.pdf 4106902/2023-08-14/b416ef3008654f84e2bee57f89cfd0513f8ec800 2611606/2023-08-12/7b010f0485bbb3bf18500a6ce90346916e776d62.pdf
 
 $(DIARIOS):
diff --git a/charts/querido-diario-pipeline/templates/text-extraction-configmap.yaml b/charts/querido-diario-pipeline/templates/text-extraction-configmap.yaml
index 12e9324..5057731 100644
--- a/charts/querido-diario-pipeline/templates/text-extraction-configmap.yaml
+++ b/charts/querido-diario-pipeline/templates/text-extraction-configmap.yaml
@@ -13,8 +13,8 @@ data:
   POSTGRES_PORT: {{ .Values.postgresql.port | quote }}
   DATABASE_RESTORE_FILE: {{ .Values.postgresql.database_restore_file }}
 
-  ELASTICSEARCH_HOST: {{ .Values.elasticsearch.host }}
-  ELASTICSEARCH_INDEX: {{ .Values.elasticsearch.index }}
+  OPENSEARCH_HOST: {{ .Values.opensearch.host }}
+  OPENSEARCH_INDEX: {{ .Values.opensearch.index }}
 
   APACHE_TIKA_SERVER: {{ .Values.apache_tika.server }}
 
diff --git a/charts/querido-diario-pipeline/templates/text-extraction-cronjob.yaml b/charts/querido-diario-pipeline/templates/text-extraction-cronjob.yaml
index 9058421..9207a2c 100644
--- a/charts/querido-diario-pipeline/templates/text-extraction-cronjob.yaml
+++ b/charts/querido-diario-pipeline/templates/text-extraction-cronjob.yaml
@@ -13,6 +13,16 @@ spec:
     template:
       spec:
         restartPolicy: {{ .Values.textExtractionJob.restartPolicy }}
+        {{- if .Values.opensearch.ca_cert_mount_path }}
+        volumes:
+        - name: opensearch-cert
+          secret:
+            secretName: {{ .Values.opensearch.certificateSecret }}
+            optional: false
+            items:
+            - key: "ca.crt"
+              path: "opensearchca.crt"
+        {{- end }}
         containers:
         - name: text-extractor-job
          image: {{ .Values.textExtractionJob.image }}
@@ -21,6 +31,14 @@ spec:
            - python
          args:
            - main
+          {{- if .Values.opensearch.ca_cert_mount_path }}
+          volumeMounts:
+          - name: opensearch-cert
+            mountPath: {{ .Values.opensearch.ca_cert_mount_path }}
+          env:
+          - name: OPENSEARCH_CA_CERTS
+            value: "{{ .Values.opensearch.ca_cert_mount_path }}/opensearchca.crt"
+          {{- end }}
          envFrom:
          - configMapRef:
              name: text-extraction-env-vars
diff --git a/charts/querido-diario-pipeline/templates/text-extraction-secrets.yaml b/charts/querido-diario-pipeline/templates/text-extraction-secrets.yaml
index aa1e570..b80b418 100644
--- a/charts/querido-diario-pipeline/templates/text-extraction-secrets.yaml
+++ b/charts/querido-diario-pipeline/templates/text-extraction-secrets.yaml
@@ -9,5 +9,5 @@ stringData:
   STORAGE_ACCESS_SECRET: {{ .Values.storage.access_secret }}
   POSTGRES_USER: {{ .Values.postgresql.user }}
   POSTGRES_PASSWORD: {{ .Values.postgresql.password }}
-  ELASTICSEARCH_USER: {{ .Values.elasticsearch.user }}
-  ELASTICSEARCH_PASSWORD: {{ .Values.elasticsearch.password }}
+  OPENSEARCH_USER: {{ .Values.opensearch.user }}
+  OPENSEARCH_PASSWORD: {{ .Values.opensearch.password }}
diff --git a/charts/querido-diario-pipeline/values.yaml b/charts/querido-diario-pipeline/values.yaml
index 640195c..9a30937 100644
--- a/charts/querido-diario-pipeline/values.yaml
+++ b/charts/querido-diario-pipeline/values.yaml
@@ -6,7 +6,7 @@ textExtractionJob:
   imagePullPolicy: "IfNotPresent"
   name: "text-extraction-job"
   restartPolicy: "OnFailure"
-  schedule: "*/30 * * * *"
+  schedule: "*/2 * * * *"
   successfulJobsHistoryLimit: "3"
 
 storage:
@@ -24,11 +24,13 @@ postgresql:
   port: 5432
   database_restore_file: contrib/data/queridodiariodb.tar
 
-elasticsearch:
-  host: http://querido-diario-elasticsearch-es-http.default.svc.cluster.local:9200
+opensearch:
+  host: https://querido-diario-index.default.svc.cluster.local:9200
   index: querido-diario
-  user: elastic
+  user: admin
   password: "cluster-pass"
+  certificateSecret: "querido-diario-index-http-cert"
+  ca_cert_mount_path: "/etc/ssl/certs"
 
 apache_tika:
   server: http://tika.tika.svc.cluster.local:9998
diff --git a/index/__init__.py b/index/__init__.py
index a7aec07..8a62430 100644
--- a/index/__init__.py
+++ b/index/__init__.py
@@ -1 +1 @@
-from .elasticsearch import create_index_interface
+from .opensearch import create_index_interface
diff --git a/index/elasticsearch.py b/index/opensearch.py
similarity index 77%
rename from index/elasticsearch.py
rename to index/opensearch.py
index cf9989e..11ba9fc 100644
--- a/index/elasticsearch.py
+++ b/index/opensearch.py
@@ -1,14 +1,14 @@
 from typing import Dict, Iterable, List, Union
 import os
 
-import elasticsearch
+from opensearchpy import OpenSearch
 
 from tasks import IndexInterface
 
 
-class ElasticSearchInterface(IndexInterface):
-    def __init__(self, hosts: List, user: str, password: str, timeout: str = "30s", default_index: str = ""):
-        self._es = elasticsearch.Elasticsearch(hosts=hosts, http_auth=(user, password))
+class OpenSearchInterface(IndexInterface):
+    def __init__(self, hosts: List, user: str = "", password: str = "", timeout: str = "30s", default_index: str = "", ca_certs_directory: str = ""):
+        self._es = OpenSearch(hosts=hosts, http_auth=(user, password), ca_certs=ca_certs_directory or None)
         self._timeout = timeout
         self._default_index = default_index
 
@@ -84,23 +84,26 @@ def paginated_search(
         self._es.clear_scroll(scroll_id=scroll_id)
 
 
-def get_elasticsearch_host():
-    return os.environ["ELASTICSEARCH_HOST"]
+def get_opensearch_host():
+    return os.environ["OPENSEARCH_HOST"]
 
 
-def get_elasticsearch_index():
-    return os.environ["ELASTICSEARCH_INDEX"]
+def get_opensearch_index():
+    return os.environ["OPENSEARCH_INDEX"]
 
 
-def get_elasticsearch_user():
-    return os.environ["ELASTICSEARCH_USER"]
+def get_opensearch_user():
+    return os.environ["OPENSEARCH_USER"]
 
 
-def get_elasticsearch_password():
-    return os.environ["ELASTICSEARCH_PASSWORD"]
+def get_opensearch_password():
+    return os.environ["OPENSEARCH_PASSWORD"]
+
+
+def get_opensearch_ca_certs_directory():
+    return os.environ.get("OPENSEARCH_CA_CERTS", "")
 
 
 def create_index_interface() -> IndexInterface:
-    hosts = get_elasticsearch_host()
+    hosts = get_opensearch_host()
     if not isinstance(hosts, str) or len(hosts) == 0:
         raise Exception("Missing index hosts")
-    default_index_name = get_elasticsearch_index()
+    default_index_name = get_opensearch_index()
     if not isinstance(default_index_name, str) or len(default_index_name) == 0:
         raise Exception("Invalid index name")
-    return ElasticSearchInterface([hosts], get_elasticsearch_user(), get_elasticsearch_password(), default_index=default_index_name)
+    return OpenSearchInterface([hosts], get_opensearch_user(), get_opensearch_password(), default_index=default_index_name, ca_certs_directory=get_opensearch_ca_certs_directory())
diff --git a/requirements.txt b/requirements.txt
index 92894c1..159644d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,8 +4,8 @@ python-magic==0.4.18
 boto3==1.22.6
 psycopg2==2.8.6
 botocore==1.25.6
-elasticsearch==7.17.3
 requests==2.25.0
 scikit-learn==1.0.2
 sentence-transformers==2.2.0
 huggingface-hub==0.10.1 # fix: https://github.com/UKPLab/sentence-transformers/issues/1762
+opensearch-py==2.3.2
diff --git a/scripts/Dockerfile b/scripts/Dockerfile
index 139d337..694034f 100644
--- a/scripts/Dockerfile
+++ b/scripts/Dockerfile
@@ -16,8 +16,9 @@ ENV PYTHONPATH $WORKDIR
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
+RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')"
+
 COPY . $WORKDIR
 WORKDIR $WORKDIR
 USER $USER
-RUN python -c "import sentence_transformers; sentence_transformers.SentenceTransformer('neuralmind/bert-base-portuguese-cased').save('"$USER_HOME"/models/bert-base-portuguese-cased')"
diff --git a/scripts/elasticsearch-cluster.yaml b/scripts/elasticsearch-cluster.yaml
deleted file mode 100644
index 7be01e0..0000000
--- a/scripts/elasticsearch-cluster.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-apiVersion: elasticsearch.k8s.elastic.co/v1
-kind: Elasticsearch
-metadata:
-  name: querido-diario-elasticsearch
-spec:
-  version: 8.10.2
-  http:
-    tls:
-      selfSignedCertificate:
-        # TLS não deve ser desabilitado em produção. Isso eh feito aqui para
-        # facilitar o ambiente de desenvolvimento.
-        disabled: true
-  nodeSets:
-  - name: default
-    count: 1
-    config:
-      node.store.allow_mmap: false
diff --git a/scripts/opensearch-cluster.yaml b/scripts/opensearch-cluster.yaml
new file mode 100644
index 0000000..a98e19a
--- /dev/null
+++ b/scripts/opensearch-cluster.yaml
@@ -0,0 +1,50 @@
+---
+apiVersion: opensearch.opster.io/v1
+kind: OpenSearchCluster
+metadata:
+  name: querido-diario-index
+  namespace: default
+spec:
+  security:
+    config:
+    tls:
+      http:
+        generate: true
+      transport:
+        generate: true
+        perNode: true
+  general:
+    httpPort: 9200
+    serviceName: querido-diario-index
+    version: 2.8.0
+    drainDataNodes: true
+    setVMMaxMapCount: true
+  dashboards:
+    tls:
+      enable: true
+      generate: true
+    version: 2.3.0
+    enable: true
+    replicas: 1
+    resources:
+      requests:
+        memory: "512Mi"
+        cpu: "200m"
+      limits:
+        memory: "512Mi"
+        cpu: "200m"
+  nodePools:
+    - component: masters
+      replicas: 3
+      resources:
+        requests:
+          memory: "4Gi"
+          cpu: "1000m"
+        limits:
+          memory: "4Gi"
+          cpu: "1000m"
+      roles:
+        - "data"
+        - "cluster_manager"
+      persistence:
+        emptyDir: {}
diff --git a/scripts/opensearch-values.yaml b/scripts/opensearch-values.yaml
deleted file mode 100644
index 8f09a94..0000000
--- a/scripts/opensearch-values.yaml
+++ /dev/null
@@ -1,3 +0,0 @@
----
-clusterName: "querido-diario"
-singleNode: true
diff --git a/tests/__init__.py b/tests/__init__.py
index 86b5e16..1d2c663 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -17,8 +17,8 @@
 
 from .main_tests import MainModuleTests
 
-from .elasticsearch import (
-    ElasticsearchBasicTests,
+from .opensearch import (
+    OpensearchBasicTests,
     IndexInterfaceFactoryFunctionTests,
-    ElasticsearchIntegrationTests,
+    OpensearchIntegrationTests,
 )
diff --git a/tests/elasticsearch.py b/tests/opensearch.py
similarity index 73%
rename from tests/elasticsearch.py
rename to tests/opensearch.py
index dc891da..01bb98e 100644
--- a/tests/elasticsearch.py
+++ b/tests/opensearch.py
@@ -3,9 +3,9 @@
 from unittest.mock import patch, MagicMock
 import uuid
 
-import elasticsearch
+import opensearchpy
 
-from index.elasticsearch import ElasticSearchInterface, create_index_interface
+from index.opensearch import OpenSearchInterface, create_index_interface
 
 from tasks import IndexInterface
 
@@ -13,8 +13,8 @@ class IndexInterfaceFactoryFunctionTests(TestCase):
     @patch.dict(
         "os.environ",
         {
-            "ELASTICSEARCH_HOST": "127.0.0.1",
-            "ELASTICSEARCH_INDEX": "index_name",
+            "OPENSEARCH_HOST": "127.0.0.1",
+            "OPENSEARCH_INDEX": "index_name",
         },
     )
     def test_create_index_interface_factory_method_with_valid_arguments(self):
@@ -29,7 +29,7 @@ def test_index_interface_factory_method_failed_without_required_info(self):
     @patch.dict(
         "os.environ",
         {
-            "ELASTICSEARCH_INDEX": "index_name",
+            "OPENSEARCH_INDEX": "index_name",
         },
     )
     @expectedFailure
@@ -39,7 +39,7 @@ def test_index_interface_factory_method_failed_with_no_hosts(self):
     @patch.dict(
         "os.environ",
         {
-            "ELASTICSEARCH_HOST": "127.0.0.1",
+            "OPENSEARCH_HOST": "127.0.0.1",
         },
     )
     @expectedFailure
@@ -49,8 +49,8 @@ def test_create_index_interface_factory_method_with_no_index(self):
     @patch.dict(
         "os.environ",
         {
-            "ELASTICSEARCH_HOST": "127.0.0.1",
-            "ELASTICSEARCH_INDEX": "",
+            "OPENSEARCH_HOST": "127.0.0.1",
+            "OPENSEARCH_INDEX": "",
         },
     )
     @expectedFailure
@@ -60,8 +60,8 @@ def test_create_index_interface_factory_method_with_empty_index(self):
     @patch.dict(
         "os.environ",
         {
-            "ELASTICSEARCH_HOST": "",
-            "ELASTICSEARCH_INDEX": "index_name",
+            "OPENSEARCH_HOST": "",
+            "OPENSEARCH_INDEX": "index_name",
         },
    )
    @expectedFailure
@@ -69,7 +69,7 @@ def test_create_index_interface_factory_method_with_empty_hosts(self):
         interface = create_index_interface()
 
 
-class ElasticsearchBasicTests(TestCase):
+class OpensearchBasicTests(TestCase):
     def setUp(self):
         document_checksum = str(uuid.uuid1())
         self.fake_document = {
@@ -89,39 +89,39 @@ def setUp(self):
             "territory_name": "Gaspar",
         }
 
-    def test_elasticsearch_should_implement_index_interface(self):
-        self.assertIsInstance(ElasticSearchInterface([]), IndexInterface)
+    def test_opensearch_should_implement_index_interface(self):
+        self.assertIsInstance(OpenSearchInterface([]), IndexInterface)
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_connection(self, elasticsearch_mock):
-        interface = ElasticSearchInterface(["127.0.0.1"])
-        elasticsearch_mock.assert_called_once_with(hosts=["127.0.0.1"])
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_connection(self, opensearch_mock):
+        interface = OpenSearchInterface(["127.0.0.1"])
+        opensearch_mock.assert_called_once_with(hosts=["127.0.0.1"], http_auth=("", ""), ca_certs=None)
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_index_creation_should_check_if_index_exists(
-        self, elasticsearch_mock
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_index_creation_should_check_if_index_exists(
+        self, opensearch_mock
     ):
-        interface = ElasticSearchInterface(["127.0.0.1"])
+        interface = OpenSearchInterface(["127.0.0.1"])
         interface._es.indices = MagicMock()
         interface._es.indices.exists = MagicMock()
         interface.create_index("querido-diario")
         interface._es.indices.exists.assert_called_once_with(index="querido-diario")
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_index_creation_should_failed_when_no_index_is_provided(
-        self, elasticsearch_mock
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_index_creation_should_failed_when_no_index_is_provided(
+        self, opensearch_mock
     ):
-        interface = ElasticSearchInterface(["127.0.0.1"])
+        interface = OpenSearchInterface(["127.0.0.1"])
         interface._es.indices = MagicMock()
         interface._es.indices.exists = MagicMock()
         with self.assertRaisesRegex(Exception, "Index name not defined"):
             interface.create_index()
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_index_creation_with_default_index_value(
-        self, elasticsearch_mock
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_index_creation_with_default_index_value(
+        self, opensearch_mock
     ):
-        interface = ElasticSearchInterface(
+        interface = OpenSearchInterface(
             ["127.0.0.1"], default_index="querido-diario2"
         )
         interface._es.indices = MagicMock()
@@ -129,11 +129,11 @@ def test_elasticsearch_index_creation_with_default_index_value(
         interface.create_index()
         interface._es.indices.exists.assert_called_once_with(index="querido-diario2")
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_index_default_timeout_should_be_30s(
-        self, elasticsearch_mock
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_index_default_timeout_should_be_30s(
+        self, opensearch_mock
     ):
-        interface = ElasticSearchInterface(["127.0.0.1"])
+        interface = OpenSearchInterface(["127.0.0.1"])
         interface._es.indices = MagicMock()
         interface._es.indices.exists = MagicMock(return_value=False)
         interface._es.indices.create = MagicMock()
@@ -144,11 +144,11 @@ def test_elasticsearch_index_default_timeout_should_be_30s(
             timeout="30s",
         )
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_index_should_allow_change_default_timeout(
-        self, elasticsearch_mock
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_index_should_allow_change_default_timeout(
+        self, opensearch_mock
     ):
-        interface = ElasticSearchInterface(["127.0.0.1"], timeout="2m")
+        interface = OpenSearchInterface(["127.0.0.1"], timeout="2m")
         interface._es.indices = MagicMock()
         interface._es.indices.exists = MagicMock(return_value=False)
         interface._es.indices.create = MagicMock()
@@ -159,11 +159,11 @@ def test_elasticsearch_index_should_allow_change_default_timeout(
             timeout="2m",
         )
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_index_creation_should_not_recreate_index_if_it_exists(
-        self, elasticsearch_mock
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_index_creation_should_not_recreate_index_if_it_exists(
+        self, opensearch_mock
     ):
-        interface = ElasticSearchInterface(["127.0.0.1"])
+        interface = OpenSearchInterface(["127.0.0.1"])
         interface._es.indices = MagicMock()
         interface._es.indices.exists = MagicMock(return_value=True)
         interface._es.indices.create = MagicMock()
@@ -171,11 +171,11 @@ def test_elasticsearch_index_creation_should_not_recreate_index_if_it_exists(
         interface._es.indices.exists.assert_called_once_with(index="querido-diario")
         interface._es.indices.create.assert_not_called()
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_should_create_index_if_it_does_not_exists(
-        self, elasticsearch_mock
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_should_create_index_if_it_does_not_exists(
+        self, opensearch_mock
     ):
-        interface = ElasticSearchInterface(["127.0.0.1"])
+        interface = OpenSearchInterface(["127.0.0.1"])
         interface._es.indices = MagicMock()
         interface._es.indices.exists = MagicMock(return_value=False)
         interface._es.indices.create = MagicMock()
@@ -187,11 +187,11 @@ def test_elasticsearch_should_create_index_if_it_does_not_exists(
             timeout="30s",
         )
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_elasticsearch_should_create_index_with_default_value_with_function_has_no_arguments(
-        self, elasticsearch_mock
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_opensearch_should_create_index_with_default_value_with_function_has_no_arguments(
+        self, opensearch_mock
     ):
-        interface = ElasticSearchInterface(
+        interface = OpenSearchInterface(
             ["127.0.0.1"], default_index="querido-diario2"
         )
         interface._es.indices = MagicMock()
@@ -205,9 +205,9 @@ def test_elasticsearch_should_create_index_with_default_value_with_function_has_
             timeout="30s",
         )
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_upload_document_to_index(self, elasticsearch_mock):
-        interface = ElasticSearchInterface(["127.0.0.1"])
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_upload_document_to_index(self, opensearch_mock):
+        interface = OpenSearchInterface(["127.0.0.1"])
         document_checksum = str(uuid.uuid1())
         interface.index_document(self.fake_document, "querido-diario")
         interface._es.index.assert_called_once_with(
@@ -216,9 +216,9 @@ def test_upload_document_to_index(self, elasticsearch_mock):
             id=self.fake_document["file_checksum"],
         )
 
-    @patch("elasticsearch.Elasticsearch", autospec=True)
-    def test_upload_document_to_index_using_default_index(self, elasticsearch_mock):
-        interface = ElasticSearchInterface(
+    @patch("index.opensearch.OpenSearch", autospec=True)
+    def test_upload_document_to_index_using_default_index(self, opensearch_mock):
+        interface = OpenSearchInterface(
             ["127.0.0.1"], default_index="querido-diario2"
         )
         document_checksum = str(uuid.uuid1())
@@ -230,7 +230,7 @@ def test_upload_document_to_index_using_default_index(self, elasticsearch_mock):
         )
 
 
-class ElasticsearchIntegrationTests(TestCase):
+class OpensearchIntegrationTests(TestCase):
     def setUp(self):
         document_checksum = str(uuid.uuid1())
         self.fake_document = {
@@ -249,7 +249,7 @@ def setUp(self):
             "state_code": "SC",
             "territory_name": "Gaspar",
         }
-        self._es = elasticsearch.Elasticsearch(hosts=["127.0.0.1"])
+        self._es = opensearchpy.OpenSearch(hosts=["127.0.0.1"])
 
     def clean_index(self, index):
         self._es.delete_by_query(
@@ -266,13 +266,13 @@ def delete_index(self, index):
 
     def test_index_creation(self):
         self.delete_index("querido-diario")
-        interface = ElasticSearchInterface(["127.0.0.1"], timeout="5m")
+        interface = OpenSearchInterface(["127.0.0.1"], timeout="5m")
         interface.create_index("querido-diario")
         self.assertTrue(self._es.indices.exists("querido-diario"))
 
     def test_index_document(self):
         self.clean_index("querido-diario")
-        interface = ElasticSearchInterface(["127.0.0.1"])
+        interface = OpenSearchInterface(["127.0.0.1"])
         interface.index_document(self.fake_document, "querido-diario")
 
         self._es.indices.refresh(index="querido-diario")
@@ -292,7 +292,7 @@ def test_index_document(self):
 
     def test_index_document_twice(self):
         self.clean_index("querido-diario")
-        interface = ElasticSearchInterface(["127.0.0.1"])
+        interface = OpenSearchInterface(["127.0.0.1"])
         interface.index_document(self.fake_document, "querido-diario")
         interface.index_document(self.fake_document, "querido-diario")
         self._es.indices.refresh(index="querido-diario")
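
For reference, a minimal sketch of how the pipeline talks to the cluster after
this migration. This is an illustration only, not part of the patch: it assumes
the opensearch-py 2.x client API, and the host, credentials and CA path below
are placeholder values taken from the chart defaults in values.yaml.

    # Illustrative only: mirrors what OpenSearchInterface does with the values
    # supplied via the text-extraction ConfigMap/Secret and the mounted CA cert.
    from opensearchpy import OpenSearch

    client = OpenSearch(
        hosts=["https://querido-diario-index.default.svc.cluster.local:9200"],
        http_auth=("admin", "cluster-pass"),         # OPENSEARCH_USER / OPENSEARCH_PASSWORD
        ca_certs="/etc/ssl/certs/opensearchca.crt",  # OPENSEARCH_CA_CERTS; None when no cert is mounted
    )
    # Same existence check create_index() performs before creating the index.
    print(client.indices.exists(index="querido-diario"))
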