diff --git a/.github/workflows/linux-executable.yml b/.github/workflows/linux-executable.yml new file mode 100644 index 0000000..6c635ce --- /dev/null +++ b/.github/workflows/linux-executable.yml @@ -0,0 +1,64 @@ +name: Upload Release Artifact Executable Linux + +on: + push: + # Sequence of patterns matched against refs/tags + tags: + - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10 + +jobs: + build-binary-executable: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + + - name: Cache maven deps + uses: actions/cache@v1 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-native-${{ hashFiles('**/deps.edn') }} + restore-keys: | + ${{ runner.os }}-maven-native- + + - name: setup-graalvm-ce + uses: rinx/setup-graalvm-ce@v0.0.1 + with: + graalvm-version: "20.1.0" + java-version: "java11" + + - name: setup-native-image + run: | + gu install native-image + + - name: Install clojure tools-deps + uses: DeLaGuardo/setup-clojure@master + with: + tools-deps: 1.10.1.469 + + - name: Compile native binary + run: | + clojure -A:native-ket + tar zcvf ket-linux.tar.gz ket + + - name: Create Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref }} + release_name: Release ${{ github.ref }} + draft: false + prerelease: false + - name: Upload Release Asset + id: upload-release-asset + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} # This pulls from the CREATE RELEASE step above, referencing it's ID to get its outputs object, which include a `upload_url`. 
See this blog post for more info: https://jasonet.co/posts/new-features-of-github-actions/#passing-data-to-future-steps + asset_path: ./ket-linux.tar.gz + asset_name: ket-linux.tar.gz + asset_content_type: application/octet-stream diff --git a/.github/workflows/macos-executable.yml b/.github/workflows/macos-executable.yml new file mode 100644 index 0000000..348b825 --- /dev/null +++ b/.github/workflows/macos-executable.yml @@ -0,0 +1,70 @@ +name: Upload Release Artifact Executable Macos + +on: + push: + # Sequence of patterns matched against refs/tags + tags: + - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10 + +jobs: + build-binary-executable: + + runs-on: macos-latest + + steps: + - uses: actions/checkout@v1 + + - name: Cache maven deps + uses: actions/cache@v1 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-native-${{ hashFiles('**/deps.edn') }} + restore-keys: | + ${{ runner.os }}-maven-native- + + - name: setup-graalvm-ce-native-image + run: | + GRAALVM_TGZ_URI="https://github.com/graalvm/graalvm-ce-builds/releases/download/vm-20.1.0/graalvm-ce-java11-darwin-amd64-20.1.0.tar.gz" + curl -sL $GRAALVM_TGZ_URI --output graalvm.tar.gz + tar -zxvf graalvm.tar.gz + ls -alh . 
+ ./graalvm-ce-java11-20.1.0/Contents/Home/bin/gu install native-image + export GRAALVM_HOME=$(pwd)/graalvm-ce-java11-20.1.0/Contents/Home/ + export JAVA_HOME=${GRAALVM_HOME} + export PATH=${GRAALVM_HOME}/bin:$PATH + + - name: Install clojure tools-deps + uses: DeLaGuardo/setup-clojure@master + with: + tools-deps: 1.10.1.469 + + - name: Compile native binary + run: | + export GRAALVM_HOME=$(pwd)/graalvm-ce-java11-20.1.0/Contents/Home/ + echo $GRAALVM_HOME + export PATH=${GRAALVM_HOME}/bin:$PATH + clojure -A:native-ket + chmod +x ket + tar zcvf ket-macos.tar.gz ket + + - name: Create Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref }} + release_name: Release ${{ github.ref }} + draft: false + prerelease: false + - name: Upload Release Asset + id: upload-release-asset + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RELEASE_VERSION: $(echo $GITHUB_REF | cut -d / -f 3) + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} # This pulls from the CREATE RELEASE step above, referencing it's ID to get its outputs object, which include a `upload_url`. 
See this blog post for more info: https://jasonet.co/posts/new-features-of-github-actions/#passing-data-to-future-steps + asset_path: ./ket-macos.tar.gz + asset_name: ket-macos.tar.gz + asset_content_type: application/octet-stream diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..3ac24f2 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,79 @@ +name: Tests + +on: [push] + +jobs: + unit-test: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v1 + + - name: Cache maven deps + uses: actions/cache@v1 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('**/deps.edn') }} + restore-keys: | + ${{ runner.os }}-maven- + - name: Prepare java + uses: actions/setup-java@v1 + with: + java-version: 1.11 + + - name: Install clojure tools-deps + uses: DeLaGuardo/setup-clojure@master + with: + tools-deps: 1.10.1.469 + + - name: Unit Tests + run: clojure -A:test -e integration + + integration-test: + name: Integration Tests + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + # Set N number of parallel jobs you want to run tests on. + # Use higher number if you have slow tests to split them on more parallel jobs. 
+ # Remember to update ci_node_index below to 0..N-1 + ci_node_total: [2] + # set N-1 indexes for parallel jobs + # When you run 2 parallel jobs then first job will have index 0, the second job will have index 1 etc + ci_node_index: [0] + java: ["1.8", "11", "14"] + elasticsearch: ["elasticsearch:6.8.8", "elasticsearch:7.8.0"] + services: + elasticsearch: + image: ${{ matrix.elasticsearch }} + ports: + - 9200/tcp + options: -e="discovery.type=single-node" --health-cmd="curl http://localhost:9200/_cluster/health" --health-interval=10s --health-timeout=5s --health-retries=10 + steps: + - uses: actions/checkout@v1 + + - name: Cache maven deps + uses: actions/cache@v1 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('**/deps.edn') }} + restore-keys: | + ${{ runner.os }}-maven- + - name: Prepare java + uses: actions/setup-java@v1 + with: + java-version: ${{ matrix.java }} + + - name: Install clojure tools-deps + uses: DeLaGuardo/setup-clojure@master + with: + tools-deps: 1.10.1.469 + + - name: Integration Tests + env: + ES_HOST: http://localhost:${{ job.services.elasticsearch.ports[9200] }} + run: | + echo $ES_HOST + clojure -A:test -i integration -e kafka diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c40d72d --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +.idea/ +**/*.iml +.cpcache +classes +.nrepl-port +.env +es-tool +target/ +.clj-kondo +.lsp diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8ca223f --- /dev/null +++ b/Makefile @@ -0,0 +1,30 @@ +include dockerfiles/docker.mk + +.PHONY: lint +lint: + clojure -M:clj-kondo + +.PHONY: unit-test +unit-test: + clojure -M:test --exclude :integration + +.PHONY: integration-test +integration-test: + clojure -M:test --include :integration + +.PHONY: run-dev-env +run-dev-env: start-stack + +ES_TEST:=-p integration-tests -f dockerfiles/docker-compose.es.test.yml -f dockerfiles/docker-compose.kafka-base.yml +.PHONY: run-integration-tests 
+run-integration-tests: + docker-compose $(ES_TEST) pull + docker-compose $(ES_TEST) down + docker-compose $(ES_TEST) build + docker-compose $(ES_TEST) up --remove-orphans --abort-on-container-exit --exit-code-from tools-test + +build-ket: + docker build -f dockerfiles/Dockerfile.executable-builder -t ket-native-image . + docker rm ket-native-image-build || true + docker create --name ket-native-image-build ket-native-image + docker cp ket-native-image-build:/usr/src/app/ket ket diff --git a/README.md b/README.md new file mode 100644 index 0000000..d00a65b --- /dev/null +++ b/README.md @@ -0,0 +1,221 @@ +# KET + +Helper tools to work with Elasticsearch and Kafka. The tool is best used as a CLI. + +## Quick Start + +Start e.g. reindexing: +```shell script +./ket reindex -f examples/reindex-settings.json +``` + +See available options: +```shell script +$ ./ket -h + -o, --operation OPERATION A name of a supported operation. One of: [kafka-to-kafka, kafka-to-ndjson, kafka-to-elasticsearch, elasticsearch-to-elasticsearch, reindex, elasticsearch-to-kafka, elasticsearch-to-ndjson, krp-to-ndjson, replay, deep-replay, polyglot, server] + --defaults Print to STDOUT the default configuration of the operation + --docs Print to STDOUT the docstring of the operation + -f, --config-file CONFIG_FILE Path to the JSON file with operation config + -h, --help + +``` + +``` +$ ./ket reindex --docs +``` + +``` +$ ./ket reindex --defaults +``` + +## Native Executable + +Either compile it yourself (for Linux): +```shell script +make build-ket +``` +or download the binary for your architecture from [here](https://github.com/vinted/kafka-elasticsearch-tool/releases) + +## Supported operations + +- reindex +- profile slow queries +- replay slow queries with various profiles +- send data from one Kafka topic to another (possibly between clusters) +- send data from a Kafka topic to an Elasticsearch index +- send data from Elasticsearch to Kafka +- store data from Elasticsearch or Kafka as an ndjson file +- 
polyglot transforms + +### Reindex + +```shell script +./ket reindex -f examples/reindex-settings.json +``` + +Reindex operation config file basic example: +```json +{ + "max_docs" : 1200, + "source" : { + "remote" : { + "host" : "http://localhost:9200" + }, + "index" : ".kibana" + }, + "dest" : { + "index" : "destination-index-name", + "remote" : { + "host" : "http://localhost:9200" + } + } +} +``` +Config file format is based on the [Elasticsearch Reindex API](https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html). + +`reindex` operation supports: +- reindexing between clusters; +- reindexing between clusters that are running different versions of Elasticsearch. +- when starting disables `refresh` interval and at the end enables it. +### Copy Kafka topic(s) data to another Kafka topic + +```shell script +./ket kafka-to-kafka -f examples/kafka-to-kafka.json +``` + +Example config: +```json +{ + "max_docs": 1, + "source": { + "topic": "source-topic", + "bootstrap.servers": "127.0.0.1:9092" + }, + "sink": { + "topic": "dest-topic", + "bootstrap.servers": "127.0.0.1:9092" + } +} +``` + +`source` and `sink` maps are for Kafka consumer and producer options respectively. All available options are supported. + +### Elasticsearch to Kafka + +```shell script +./ket elasticsearch-to-kafka -f examples/es-to-kafka.json +``` +Example configuration: +```json +{ + "max_docs": 10000, + "source": { + "remote": { + "connect_timeout": "10s", + "host": "http://localhost:9200", + "socket_timeout": "1m" + }, + "index": ".kibana", + "query": { + "sort": [ + "_doc" + ], + "size": 2000 + } + }, + "sink": { + "topic": "kibana-data", + "bootstrap.servers": "127.0.0.1:9092" + } +} + +``` +`source` is the same as in reindex, +`sink` is Kafka Producer option map. 
+ +## Elasticsearch to ndjson + +```shell script +./ket elasticsearch-to-ndjson -f examples/es-to-ndjson.json +``` +Example configuration: +```json +{ + "max_docs": 10000, + "source": { + "remote": { + "host": "http://localhost:9200" + }, + "index": ".kibana", + "query": { + "size": 2000 + } + }, + "sink": { + "filename": "es-docs.ndjson" + } +} +``` + +## Kafka to ndjson + +```shell script +./ket kafka-to-ndjson -f examples/kafka-to-ndjson.json +``` +Example configuration: +```json +{ + "max_docs": 10000, + "source": { + "bootstrap.servers": "127.0.0.1:9092", + "topic": "topic-name", + "impatient": true + }, + "sink": { + "filename": "es-docs.ndjson" + } +} +``` + +## ndjson to Elasticsearch + +```shell script +curl -s -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/_bulk --data-binary @file.ndjson +``` + +## Polyglot transforms + +Try it yourself, e.g.: +```shell +clojure -M -m core polyglot --data='{"foo":"bar"}' --file="my-script.js" --lang=js | jq '.result | fromjson' +{ + "foo": "bar", + "a": 123 +} +``` +As seen in the example, the script can be stored in a file. + +Supported languages are `['js' 'sci']`. + +## Logging + +Logging is controlled by the [logback](http://logback.qos.ch/) library. The output layout is JSON (you can query it with `jq` or collect logs with logstash or beats). +The default logging level is `INFO`. +When executed as a binary, i.e. `./ket OPERATION`, then logging levels are controlled by an environment variable called: `ROOT_LOGGER_LEVEL`, e.g. `ROOT_LOGGER_LEVEL=WARN ./ket operation -f config.json` +Acceptable values of the `ROOT_LOGGER_LEVEL` are: `TRACE`, `DEBUG`, `INFO`, `WARN`, `ERROR`, `FATAL`. +When an unknown value is provided, e.g. `ROOT_LOGGER_LEVEL=foo ./ket operation -f config.json`, then logback defaults to `DEBUG` logging level. 
+ +## Supported Elasticsearch Versions + +- 7.x.y + +## Development + +Development requires [GraalVM 20.3.0+](https://github.com/graalvm/graalvm-ce-builds/releases/tag/vm-20.3.0), +Docker and Docker Compose, GNU Make, and [Clojure CLI tools](https://clojure.org/guides/getting_started). + +## License + +Copyright © 2021 [Vinted](https://www.vinted.engineering). + +Distributed under BSD 3-Clause License diff --git a/deps.edn b/deps.edn new file mode 100644 index 0000000..a0a8b50 --- /dev/null +++ b/deps.edn @@ -0,0 +1,71 @@ +{:paths + ["src" "resources"] + :mvn/repos + {"confluence" {:url "https://packages.confluent.io/maven/"}} ; fetch artifacts over TLS, not plain http + :deps + {org.clojure/clojure {:mvn/version "1.10.2-rc3"} + borkdude/sci {:mvn/version "0.1.1-alpha.10"} + borkdude/sci.impl.reflector {:mvn/version "0.0.1-java11"} + lt.jocas/lazy-elasticsearch-scroll {:mvn/version "1.0.16"} + http-kit/http-kit {:mvn/version "2.5.0"} + metosin/reitit {:mvn/version "0.5.10"} + io.confluent/kafka-connect-elasticsearch {:mvn/version "10.0.0"} + org.apache.kafka/connect-api {:mvn/version "2.6.0"} + org.apache.kafka/connect-json {:mvn/version "2.6.0"} + org.clojure/tools.logging {:mvn/version "1.1.0"} + org.clojure/tools.cli {:mvn/version "1.0.194"} + org.clojure/core.async {:mvn/version "1.3.610"} + ch.qos.logback/logback-core {:mvn/version "1.2.3"} + ch.qos.logback/logback-classic {:mvn/version "1.2.3"} + ch.qos.logback.contrib/logback-json-classic {:mvn/version "0.1.5"} + ch.qos.logback.contrib/logback-jackson {:mvn/version "0.1.5"} + io.quarkus/quarkus-kafka-client {:mvn/version "1.9.2.Final" + :exclusions [org.jboss.slf4j/slf4j-jboss-logging + org.jboss.logging/jboss-logging]}} + :aliases + {:dev + {:extra-paths ["dev" "classes" "test" "test/resources"] + :extra-deps {org.clojure/tools.deps.alpha {:git/url "https://github.com/clojure/tools.deps.alpha.git" + :sha "f6c080bd0049211021ea59e516d1785b08302515" + :exclusions [org.slf4j/slf4j-log4j12 + org.slf4j/slf4j-api + org.slf4j/slf4j-nop]} + 
criterium/criterium {:mvn/version "0.4.6"}}} + :test + {:extra-paths ["test" "test/resources"] + :extra-deps {com.cognitect/test-runner {:git/url "https://github.com/cognitect-labs/test-runner.git" + :sha "028a6d41ac9ac5d5c405dfc38e4da6b4cc1255d5"}} + :main-opts ["-m" "cognitect.test-runner"]} + :clj-kondo + {:main-opts ["-m" "clj-kondo.main" "--lint" "src" "test"] ; one string per CLI token; do not rely on word splitting + :extra-deps {clj-kondo/clj-kondo {:mvn/version "2020.11.07"}} + :jvm-opts ["-Dclojure.main.report=stderr"]} + :native-ket + {:main-opts ["-m" "clj.native-image" "core" ; one string per CLI token; "core" is the main namespace to compile + "--static" + "--enable-https" + "--no-fallback" + "--language:js" + "--allow-incomplete-classpath" + "--initialize-at-build-time" + "--enable-all-security-services" + "--initialize-at-run-time=org.httpkit.client.HttpClient" + "--initialize-at-run-time=org.httpkit.client.SslContextFactory" + "--report-unsupported-elements-at-runtime" + "-J-Dclojure.compiler.direct-linking=true" + "-H:ReflectionConfigurationFiles=graalvm/reflect-config.json" + ;; optional native image name override + "-H:+ReportExceptionStackTraces" + "-H:IncludeResources=logback.xml" + "-H:Name=ket"] + :jvm-opts ["-Dclojure.compiler.direct-linking=true"] + :extra-deps {org.jboss.logging/commons-logging-jboss-logging + {:mvn/version "1.0.0.Final"} + borkdude/clj-reflector-graal-java11-fix + {:mvn/version "0.0.1-graalvm-20.3.0" + :exclusions [org.graalvm.nativeimage/svm]} + clj.native-image/clj.native-image + {:git/url "https://github.com/taylorwood/clj.native-image.git" + :exclusions [commons-logging/commons-logging + org.slf4j/slf4j-nop] + :sha "7708e7fd4572459c81f6a6b8e44c96f41cdd92d4"}}}}} diff --git a/dev/user.clj b/dev/user.clj new file mode 100644 index 0000000..ca9ada7 --- /dev/null +++ b/dev/user.clj @@ -0,0 +1,3 @@ +(ns user) + +(set! 
*warn-on-reflection* true) diff --git a/dockerfiles/Dockerfile.executable-builder b/dockerfiles/Dockerfile.executable-builder new file mode 100644 index 0000000..e8e840a --- /dev/null +++ b/dockerfiles/Dockerfile.executable-builder @@ -0,0 +1,23 @@ +FROM oracle/graalvm-ce:20.3.0-java11 as BUILDER + +ENV GRAALVM_HOME=$JAVA_HOME + +RUN gu install native-image \ + && gu install ruby \ + && gu install python \ + && gu install r \ + && curl -O https://download.clojure.org/install/linux-install-1.10.1.727.sh \ + && chmod +x linux-install-1.10.1.727.sh \ + && ./linux-install-1.10.1.727.sh \ + && rm linux-install-1.10.1.727.sh + +RUN mkdir -p /usr/src/app +WORKDIR /usr/src/app + +COPY deps.edn /usr/src/app/ +RUN clojure -P -M:native-ket +COPY resources/ /usr/src/app/resources +COPY graalvm/ /usr/src/app/graalvm/ +COPY src/ /usr/src/app/src + +RUN clojure -M:native-ket diff --git a/dockerfiles/Dockerfile.test b/dockerfiles/Dockerfile.test new file mode 100644 index 0000000..2dce149 --- /dev/null +++ b/dockerfiles/Dockerfile.test @@ -0,0 +1,21 @@ +FROM oracle/graalvm-ce:20.3.0-java11 as BUILDER + +ENV GRAALVM_HOME=$JAVA_HOME + +RUN gu install native-image \ + && gu install ruby \ + && gu install python \ + && gu install r \ + && curl -O https://download.clojure.org/install/linux-install-1.10.1.727.sh \ + && chmod +x linux-install-1.10.1.727.sh \ + && ./linux-install-1.10.1.727.sh \ + && rm linux-install-1.10.1.727.sh + +RUN mkdir /root/.gitlibs + +WORKDIR /usr/src/app +COPY deps.edn /usr/src/app/ + +RUN clojure -P -M:test + +COPY . 
/usr/src/app diff --git a/dockerfiles/docker-compose.es.test.yml b/dockerfiles/docker-compose.es.test.yml new file mode 100644 index 0000000..c4157bb --- /dev/null +++ b/dockerfiles/docker-compose.es.test.yml @@ -0,0 +1,27 @@ +version: '3' +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:${ES_VERSION:-7.10.0} + environment: + - discovery.type=single-node + - bootstrap.memory_lock=true + - logger.level=WARN + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + command: > + bash -c "elasticsearch-plugin install analysis-stempel; + docker-entrypoint.sh eswrapper" + ulimits: + memlock: + soft: -1 + hard: -1 + + tools-test: + depends_on: + - elasticsearch + build: + context: ../ + dockerfile: dockerfiles/Dockerfile.test + environment: + ES_HOST: http://elasticsearch:9200 + KAFKA_BOOTSTRAP_SERVERS: http://broker:29092 + command: ["clojure", "-M:test", "--include", "integration"] diff --git a/dockerfiles/docker-compose.es.yml b/dockerfiles/docker-compose.es.yml new file mode 100644 index 0000000..c864439 --- /dev/null +++ b/dockerfiles/docker-compose.es.yml @@ -0,0 +1,27 @@ +version: '3' +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:${ES_VERSION:-7.10.0} + environment: + - discovery.type=single-node + - bootstrap.memory_lock=true + - logger.level=WARN + - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" + ulimits: + memlock: + soft: -1 + hard: -1 + command: > + bash -c "elasticsearch-plugin install analysis-stempel; + docker-entrypoint.sh eswrapper" + ports: + - 9200:9200 + + kibana: + image: docker.elastic.co/kibana/kibana:${ES_VERSION:-7.10.0} + environment: + SERVER_NAME: kibana + LOGGING_QUIET: 'true' + ELASTICSEARCH_HOSTS: http://elasticsearch:9200 + ports: + - 5601:5601 diff --git a/dockerfiles/docker-compose.kafka-base.yml b/dockerfiles/docker-compose.kafka-base.yml new file mode 100644 index 0000000..7801f86 --- /dev/null +++ b/dockerfiles/docker-compose.kafka-base.yml @@ -0,0 +1,48 @@ +version: '3' + +services: + 
zookeeper: + image: confluentinc/cp-zookeeper:5.5.1 + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + ZOOKEEPER_TICK_TIME: 2000 + ZOOKEEPER_LOG4J_ROOT_LOGLEVEL: ERROR + + broker: + image: confluentinc/cp-server:5.5.1 + depends_on: + - zookeeper + environment: + KAFKA_ADVERTISED_HOST_NAME: localhost + KAFKA_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://:9092 + KAFKA_BROKER_ID: 1 + KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 + KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 + KAFKA_CONFLUENT_LICENSE_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 + CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092 + CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181 + CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 + CONFLUENT_METRICS_ENABLE: 'true' + CONFLUENT_SUPPORT_CUSTOMER_ID: 'anonymous' + KAFKA_LOG4J_LOGGERS: org=ERROR,io=ERROR,kafka.controller=ERROR,kafka=ERROR,state.change.logger=ERROR,kafka.log.LogCleaner=WARN + KAFKA_LOG4J_ROOT_LOGLEVEL: ERROR + volumes: + - /var/run/docker.sock:/var/run/docker.sock + + restproxy: + image: confluentinc/cp-kafka-rest:5.5.1 + depends_on: + - zookeeper + - broker + environment: + KAFKA_REST_HOST_NAME: restproxy + KAFKA_REST_BOOTSTRAP_SERVERS: 'broker:29092' + KAFKA_REST_ZOOKEEPER_CONNECT: 'zookeeper:2181' + KAFKA_REST_HEAP_OPTS: "-Xms1024m -Xmx1024m" + KAFKA_REST_LOG4J_ROOT_LOGLEVEL: ERROR diff --git a/dockerfiles/docker-compose.kafka.yml b/dockerfiles/docker-compose.kafka.yml new file mode 100644 index 0000000..80bcfd8 --- /dev/null +++ b/dockerfiles/docker-compose.kafka.yml @@ -0,0 +1,30 @@ +version: '3' + +services: + zookeeper: + ports: ["2181:2181"] + + broker: + 
ports: ["9092:9092"] + + restproxy: + ports: ["8082:8082"] + + control-center: + image: confluentinc/cp-enterprise-control-center:5.5.1 + hostname: control-center + depends_on: + - zookeeper + - broker + ports: + - "9021:9021" + environment: + CONTROL_CENTER_LOG4J_ROOT_LOGLEVEL: 'ERROR' + CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092' + CONTROL_CENTER_ZOOKEEPER_CONNECT: 'zookeeper:2181' + CONTROL_CENTER_CONNECT_CLUSTER: 'connect:8088' + CONTROL_CENTER_REPLICATION_FACTOR: 1 + CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1 + CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1 + CONFLUENT_METRICS_TOPIC_REPLICATION: 1 + PORT: 9021 diff --git a/dockerfiles/docker.mk b/dockerfiles/docker.mk new file mode 100644 index 0000000..778da13 --- /dev/null +++ b/dockerfiles/docker.mk @@ -0,0 +1,8 @@ +.PHONY: start-stack +start-stack: + docker-compose \ + -p es-kibana-kafka \ + -f dockerfiles/docker-compose.es.yml \ + -f dockerfiles/docker-compose.kafka-base.yml \ + -f dockerfiles/docker-compose.kafka.yml \ + up --remove-orphans diff --git a/examples/combined-text.json b/examples/combined-text.json new file mode 100644 index 0000000..7058338 --- /dev/null +++ b/examples/combined-text.json @@ -0,0 +1,43 @@ +{ + "max_docs": 10000, + "source": { + "remote": { + "host": "http://localhost:9200", + "socket_timeout": "5m" + }, + "index": "logstash-elastic7-slow-query-2020.05.12", + "query": { + "query": { + "bool": { + "must": [ + { + "match": { + "source": "multi_match" + } + } + ] + } + } + } + }, + "target-es-host": "http://localhost:9200", + "concurrency": 100, + "times-to-repeat": 5, + "dest": { + "index": "compare_combined_text", + "remote": { + "host": "http://localhost:9200" + } + }, + "sink": { + "batch.size" : 100, + "max.buffered.records" : 200, + "max.in.flight.requests" : 2, + "flush.timeout.ms" : 60000, + "retry.backoff.ms" : 900, + "max.retries" : 5, + "linger.ms" : 10000, + "read.timeout.ms" : 60000, + "connection.timeout.ms" : 60000 + } +} diff --git 
a/examples/deep-replay.json b/examples/deep-replay.json new file mode 100644 index 0000000..5ee25c8 --- /dev/null +++ b/examples/deep-replay.json @@ -0,0 +1,36 @@ +{ + "max_docs": 1, + "source": { + "remote": { + "host": "http://localhost:9200" + }, + "index": "query_logs", + "query": { + "query": { + "bool": { + "match_all": {} + } + } + } + }, + "replay": { + "description": "deep replay description", + "uri_attr": "uri", + "target-index": "target-index-name", + "repeats": 1, + "concurrency": 10, + "uri-transforms": [], + "query-transforms": [], + "id": "my-replay-id", + "depth": 10000000, + "connection.url": "http://localhost:9200", + "replay_data_attr": "replay", + "query_attr": "request" + }, + "sink": { + "implementation": "kafka", + "topic": "sink_topic_name", + "bootstrap.servers": "localhost:9092", + "linger.ms": 1000 + } +} diff --git a/examples/es-to-kafka.json b/examples/es-to-kafka.json new file mode 100644 index 0000000..877dcc0 --- /dev/null +++ b/examples/es-to-kafka.json @@ -0,0 +1,22 @@ +{ + "max_docs": 10000, + "source": { + "remote": { + "connect_timeout": "10s", + "host": "http://localhost:9200", + "socket_timeout": "1m" + }, + "index": ".kibana", + "query": { + "sort": [ + "_doc" + ], + "size": 2000 + }, + "keywordize?": false + }, + "sink": { + "topic": "kibana-data", + "bootstrap.servers": "127.0.0.1:9092" + } +} diff --git a/examples/es-to-ndjson.json b/examples/es-to-ndjson.json new file mode 100644 index 0000000..0c06ada --- /dev/null +++ b/examples/es-to-ndjson.json @@ -0,0 +1,15 @@ +{ + "max_docs": 10000, + "source": { + "remote": { + "host": "http://localhost:9200" + }, + "index": ".kibana", + "query": { + "size": 2000 + } + }, + "sink": { + "filename ": "es-docs.ndjson" + } +} diff --git a/examples/kafka-to-es.json b/examples/kafka-to-es.json new file mode 100644 index 0000000..53e16ba --- /dev/null +++ b/examples/kafka-to-es.json @@ -0,0 +1,14 @@ +{ + "max_docs": 1, + "source": { + "topic": "source-topic", + "bootstrap.servers": 
"127.0.0.1:9092" + }, + "dest": { + "index": "dest-index-name", + "remote": { + "host": "http://localhost:9200" + } + }, + "sink": {} +} diff --git a/examples/kafka-to-kafka.json b/examples/kafka-to-kafka.json new file mode 100644 index 0000000..af79b05 --- /dev/null +++ b/examples/kafka-to-kafka.json @@ -0,0 +1,11 @@ +{ + "max_docs": 1, + "source": { + "topic": "source-topic", + "bootstrap.servers": "127.0.0.1:9092" + }, + "sink": { + "topic": "dest-topic", + "bootstrap.servers": "127.0.0.1:9092" + } +} diff --git a/examples/kafka-to-ndjson.json b/examples/kafka-to-ndjson.json new file mode 100644 index 0000000..0ac0adb --- /dev/null +++ b/examples/kafka-to-ndjson.json @@ -0,0 +1,10 @@ +{ + "max_docs": 1, + "source": { + "topic": "source-topic", + "bootstrap.servers": "127.0.0.1:9092" + }, + "sink": { + "filename ": "es-docs.ndjson" + } +} diff --git a/examples/krp-to-ndjson.json b/examples/krp-to-ndjson.json new file mode 100644 index 0000000..3fbe787 --- /dev/null +++ b/examples/krp-to-ndjson.json @@ -0,0 +1,18 @@ +{ + "max_docs": 100, + "source": { + "connection.url": "http://localhost:8082", + "group.id": "krp-group1_instance", + "consumer.name": "krp_instance", + "topic": "topic-name", + "timeout": 10000, + "offset": 0, + "concurrency": 20, + "consumer.request.timeout.ms": 5000, + "delete.consumer.instance": false + }, + "sink": { + "filename": "ndjson/deep-replay.ndjson", + "partition-size": 1000 + } +} diff --git a/examples/profile-slow-queries.json b/examples/profile-slow-queries.json new file mode 100644 index 0000000..e0f66e3 --- /dev/null +++ b/examples/profile-slow-queries.json @@ -0,0 +1,26 @@ +{ + "max_docs" : 10, + "source" : { + "remote" : { + "host" : "http://localhost:9200" + }, + "index" : "logstash-elastic6-slow-query-2020.02.25", + "query": { + "query": { + "term": { + "index.keyword": { + "value": "value" + } + } + } + } + }, + "target-es-host": "http://localhost:9200", + "concurrency": 100, + "dest" : { + "index" : "slow-logs-6-replay", + 
"remote" : { + "host" : "http://localhost:9200" + } + } +} diff --git a/examples/reindex-demo.json b/examples/reindex-demo.json new file mode 100644 index 0000000..040a69f --- /dev/null +++ b/examples/reindex-demo.json @@ -0,0 +1,25 @@ +{ + "max_docs": 20000000, + "source": { + "remote": { + "connect_timeout": "600s", + "host": "http://localhost:9200" + }, + "index": ".kibana" + }, + "dest": { + "index": "kibana_backup", + "remote": { + "host": "http://localhost:9200" + } + }, + "sink": { + "batch.size": 100, + "max.in.flight.requests": 32, + "max.buffered.records": 10000, + "retry.backoff.ms": 500, + "flush.timeout.ms": 60000, + "connection.timeout.ms": 60000, + "read.timeout.ms": 120000 + } +} diff --git a/examples/reindex-settings.json b/examples/reindex-settings.json new file mode 100644 index 0000000..7fb98d1 --- /dev/null +++ b/examples/reindex-settings.json @@ -0,0 +1,15 @@ +{ + "max_docs" : 1200, + "source" : { + "remote" : { + "host" : "http://localhost:9200" + }, + "index" : ".kibana" + }, + "dest" : { + "index" : "destination-index-name", + "remote" : { + "host" : "http://localhost:9200" + } + } +} diff --git a/examples/replay-for-impact.json b/examples/replay-for-impact.json new file mode 100644 index 0000000..09aed06 --- /dev/null +++ b/examples/replay-for-impact.json @@ -0,0 +1,86 @@ +{ + "max_docs": 100, + "source": { + "remote": { + "host": "http://localhost:9200" + }, + "index": "query_logs", + "query": { + "query": { + "bool": { + "filter": [ + { + "term": { + "query_from": { + "value": 0 + } + } + }, + { + "term": { + "stats": { + "value": "regular_items_lookup" + } + } + }, + { + "range": { + "header.timestamp": { + "gte": "now-2d" + } + } + }, + { + "match": { + "request": "multi_match" + } + }, + { + "prefix": { + "uri.keyword": "/index_name/_search" + } + } + ], + "must_not": [ + { + "exists": { + "field": "query_sort" + } + } + ] + } + }, + "sort": [ + { + "header.timestamp": { + "order": "asc" + } + } + ], + "docvalue_fields": [ + 
"uri.index" + ], + "size": 1 + } + }, + "replay": { + "connection.url": "http://localhost:9200", + "concurrency": 10, + "top-k": 100, + "query-transforms": [ + { + "id": "test2", + "lang": "sci", + "script": "(fn [query boost] query)", + "vals": [ + 123 + ] + } + ] + }, + "sink": { + "connection.url": "http://localhost:9200", + "dest.index": "impact_sink_index", + "batch.size": 50 + } +} \ No newline at end of file diff --git a/examples/replay.json b/examples/replay.json new file mode 100644 index 0000000..ee72bc1 --- /dev/null +++ b/examples/replay.json @@ -0,0 +1,50 @@ +{ + "max_docs" : 10, + "source" : { + "remote" : { + "host" : "http://localhost:9200" + }, + "index" : "source-index-name", + "query": { + "query": { + "term": { + "title": { + "value": "interesting" + } + } + } + } + }, + "replay" : { + "description" : "Sample query replay description", + "uri_attr" : "uri", + "repeats" : 1, + "concurrency" : 1, + "uri-transforms" : [ + { + "match":"_count\\?", + "replacement":"_search?size=0&" + } + ], + "query-transforms" : [ + { + "lang" : "js", + "script" : "(q) => Object.assign(q, {'_source': true})" + }, + { + "lang": "sci", + "script": "(fn [q] (assoc q :_explain true))" + } + ], + "id" : "my-replay-id", + "connection.url" : "http://localhost:9200", + "replay_data_attr" : "replay", + "query_attr" : "request" + }, + "sink" : { + "index" : "dest-index-name", + "remote" : { + "host" : "http://localhost:9200" + } + } +} diff --git a/graalvm/jni-config.json b/graalvm/jni-config.json new file mode 100644 index 0000000..0d4f101 --- /dev/null +++ b/graalvm/jni-config.json @@ -0,0 +1,2 @@ +[ +] diff --git a/graalvm/proxy-config.json b/graalvm/proxy-config.json new file mode 100644 index 0000000..0d4f101 --- /dev/null +++ b/graalvm/proxy-config.json @@ -0,0 +1,2 @@ +[ +] diff --git a/graalvm/reflect-config.json b/graalvm/reflect-config.json new file mode 100644 index 0000000..69c10af --- /dev/null +++ b/graalvm/reflect-config.json @@ -0,0 +1,56 @@ +[ + { + 
"name":"org.apache.kafka.clients.consumer.ConsumerConfig", + "allPublicFields":true + }, + { + "name":"org.apache.kafka.clients.consumer.KafkaConsumer", + "allPublicMethods":true, + "allPublicConstructors":true + }, + { + "name":"org.apache.kafka.clients.consumer.RangeAssignor", + "methods":[{"name":"","parameterTypes":[] }] + }, + { + "name":"org.apache.kafka.clients.producer.KafkaProducer", + "allPublicMethods":true, + "allPublicConstructors":true + }, + { + "name":"org.apache.kafka.clients.producer.ProducerConfig", + "allPublicFields":true + }, + { + "name":"org.apache.kafka.clients.producer.ProducerRecord", + "allPublicConstructors":true + }, + { + "name":"org.apache.kafka.clients.producer.internals.DefaultPartitioner", + "methods":[{"name":"","parameterTypes":[] }] + }, + { + "name":"org.apache.kafka.common.serialization.LongDeserializer", + "methods":[{"name":"","parameterTypes":[] }] + }, + { + "name":"org.apache.kafka.common.serialization.LongSerializer", + "methods":[{"name":"","parameterTypes":[] }] + }, + { + "name":"org.apache.kafka.common.serialization.StringDeserializer", + "methods":[{"name":"","parameterTypes":[] }] + }, + { + "name":"org.apache.kafka.common.serialization.StringSerializer", + "methods":[{"name":"","parameterTypes":[] }] + }, + { + "name":"org.apache.kafka.connect.data.Schema", + "allPublicFields":true + }, + { + "name":"org.apache.kafka.connect.sink.SinkRecord", + "allPublicConstructors":true + } +] diff --git a/graalvm/resource-config.json b/graalvm/resource-config.json new file mode 100644 index 0000000..5f12357 --- /dev/null +++ b/graalvm/resource-config.json @@ -0,0 +1,86 @@ +{ + "resources":[ + {"pattern":"\\Qclojure/core.clj\\E"}, + {"pattern":"\\Qclojure/core/async.clj\\E"}, + {"pattern":"\\Qclojure/core/async/impl/buffers.clj\\E"}, + {"pattern":"\\Qclojure/core/async/impl/channels.clj\\E"}, + {"pattern":"\\Qclojure/core/async/impl/concurrent.clj\\E"}, + {"pattern":"\\Qclojure/core/async/impl/dispatch.clj\\E"}, + 
{"pattern":"\\Qclojure/core/async/impl/exec/threadpool.clj\\E"}, + {"pattern":"\\Qclojure/core/async/impl/ioc_macros.clj\\E"}, + {"pattern":"\\Qclojure/core/async/impl/mutex.clj\\E"}, + {"pattern":"\\Qclojure/core/async/impl/protocols.clj\\E"}, + {"pattern":"\\Qclojure/core/async/impl/timers.clj\\E"}, + {"pattern":"\\Qclojure/core/cache.clj\\E"}, + {"pattern":"\\Qclojure/core/memoize.clj\\E"}, + {"pattern":"\\Qclojure/core/server.clj\\E"}, + {"pattern":"\\Qclojure/core/server__init.class\\E"}, + {"pattern":"\\Qclojure/core/specs/alpha.clj\\E"}, + {"pattern":"\\Qclojure/core__init.class\\E"}, + {"pattern":"\\Qclojure/data/priority_map.clj\\E"}, + {"pattern":"\\Qclojure/pprint.clj\\E"}, + {"pattern":"\\Qclojure/pprint__init.class\\E"}, + {"pattern":"\\Qclojure/reflect.clj\\E"}, + {"pattern":"\\Qclojure/reflect__init.class\\E"}, + {"pattern":"\\Qclojure/spec/alpha.clj\\E"}, + {"pattern":"\\Qclojure/spec/alpha__init.class\\E"}, + {"pattern":"\\Qclojure/tools/analyzer.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/ast.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/env.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/jvm.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/jvm/utils.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/add_binding_atom.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/cleanup.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/constant_lifter.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/elide_meta.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/emit_form.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/analyze_host_expr.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/annotate_host_info.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/annotate_loops.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/annotate_tag.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/box.clj\\E"}, + 
{"pattern":"\\Qclojure/tools/analyzer/passes/jvm/classify_invoke.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/constant_lifter.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/emit_form.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/fix_case_test.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/infer_tag.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/validate.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/validate_loop_locals.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/validate_recur.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/jvm/warn_on_reflection.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/source_info.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/trim.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/uniquify.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/passes/warn_earmuff.clj\\E"}, + {"pattern":"\\Qclojure/tools/analyzer/utils.clj\\E"}, + {"pattern":"\\Qclojure/tools/cli.cljc\\E"}, + {"pattern":"\\Qclojure/tools/logging.clj\\E"}, + {"pattern":"\\Qclojure/tools/logging/impl.clj\\E"}, + {"pattern":"\\Qclojure/tools/reader.clj\\E"}, + {"pattern":"\\Qclojure/tools/reader/default_data_readers.clj\\E"}, + {"pattern":"\\Qclojure/tools/reader/impl/commons.clj\\E"}, + {"pattern":"\\Qclojure/tools/reader/impl/errors.clj\\E"}, + {"pattern":"\\Qclojure/tools/reader/impl/inspect.clj\\E"}, + {"pattern":"\\Qclojure/tools/reader/impl/utils.clj\\E"}, + {"pattern":"\\Qclojure/tools/reader/reader_types.clj\\E"}, + {"pattern":"\\Qcore.clj\\E"}, + {"pattern":"\\Qcore/async.clj\\E"}, + {"pattern":"\\Qcore/cartesian.clj\\E"}, + {"pattern":"\\Qcore/http.clj\\E"}, + {"pattern":"\\Qcore/ilm.clj\\E"}, + {"pattern":"\\Qcore/index.clj\\E"}, + {"pattern":"\\Qcore/json.clj\\E"}, + {"pattern":"\\Qcore/records.clj\\E"}, + {"pattern":"\\Qjsonista/core.clj\\E"}, + {"pattern":"\\Qkafka/kafka-version.properties\\E"}, + 
{"pattern":"\\Qorg/httpkit/client.clj\\E"}, + {"pattern":"\\Qorg/httpkit/encode.clj\\E"}, + {"pattern":"\\Qorg/slf4j/impl/StaticLoggerBinder.class\\E"}, + {"pattern":"\\Qprofile_slow_queries.clj\\E"}, + {"pattern":"\\Qreindex.clj\\E"}, + {"pattern":"\\Qreplay.clj\\E"}, + {"pattern":"\\Qscroll.clj\\E"} + ], + "bundles":[] +} diff --git a/replay-problem.json b/replay-problem.json new file mode 100644 index 0000000..29f1178 --- /dev/null +++ b/replay-problem.json @@ -0,0 +1,61 @@ +{ + "max_docs": 10, + "source": { + "remote": { + "host": "http://localhost:9200" + }, + "index": "query_logs", + "query": { + "query": { + "bool": { + "filter": [ + { + "term": { + "uri.index": "index_name" + } + }, + { + "match_phrase": { + "request": "created_at lte" + } + }, + { + "match_phrase": { + "request": "user_updated_at lte" + } + } + ] + } + }, + "sort": [ + { + "header.timestamp": { + "order": "desc" + } + } + ] + } + }, + "replay": { + "description": "Sample query replay description", + "uri_attr": "uri", + "repeats": 1, + "concurrency": 1, + "query-transforms": [ + { + "lang": "js", + "script": "(request) => { return request; }" + } + ], + "id": "my-replay-id-0", + "connection.url": "http://localhost:9200", + "replay_data_attr": "replay", + "query_attr": "request" + }, + "dest": { + "index": "experiment", + "remote": { + "host": "http://localhost:9200" + } + } +} diff --git a/resources/logback.xml b/resources/logback.xml new file mode 100644 index 0000000..08554ac --- /dev/null +++ b/resources/logback.xml @@ -0,0 +1,17 @@ + + + + + + false + + true + yyyy-MM-dd'T'HH:mm:ss.SSS'Z' + + + + + + + diff --git a/src/cli.clj b/src/cli.clj new file mode 100644 index 0000000..eb28756 --- /dev/null +++ b/src/cli.clj @@ -0,0 +1,47 @@ +(ns cli + (:require [clojure.string :as str] + [clojure.tools.cli :as clojure.cli] + [cli.operation :as operation])) + +(defn op-names [operations] + (str "[" + (str/join ", " (map (fn [op] (str (:name op))) operations)) + "]")) + +(defn cli-options 
[operations] + [["-o" "--operation OPERATION" (format "A name of a supported operation. One of: %s" + (op-names operations)) + :parse-fn #(keyword %)] + [nil "--defaults" "Print to STDOUT the default configuration of the operation" + :default nil] + [nil "--docs" "Print to STDOUT the docstring of the operation" + :default nil] + ["-f" "--config-file CONFIG_FILE" "Path to the JSON file with operation config"] + ["-h" "--help"]]) + +(defn find-operation + "Returns the first operation in `operations` whose :name equals (name operation-name), or nil when none matches." + [operation-name operations] + (first (filter (fn [op] (= (name operation-name) (:name op))) operations))) + +(defn remove-nil-vals + "Returns a map holding the entries of `m` whose value is not nil. Entries whose value is false are kept, so an explicitly parsed boolean false is not dropped." + [m] + (into {} (remove (fn [[_ v]] (nil? v)) m))) + +(defn recursive-parse + "Parses `args` against the top-level options, in order. When positional arguments remain, the first one is taken as a subcommand name: if it matches an operation in `commands`, :operation holds its keyword :name and the :conf parsed from the remaining arguments with the operation :defaults; otherwise :operation holds an :errors vector. nil-valued entries are removed from :options." + [args commands] + (let [{:keys [arguments] :as o} (clojure.cli/parse-opts args (cli-options commands) + :in-order true)] + (update (if (seq arguments) + (let [[subcommand & args] arguments + command (find-operation subcommand commands)] + (if command + (assoc o :operation {:name (keyword subcommand) + :conf (-> (operation/parse-opts args (:defaults command) subcommand) + (update :options remove-nil-vals))}) + (assoc o :operation {:errors [(format "Subcommand '%s' does not exist" subcommand)]}))) + o) + :options remove-nil-vals))) + +(comment + (cli/recursive-parse ["server" "--port" "8080"] core/cli-operations) + + (cli/recursive-parse ["replay" "sink" "--help"] core/cli-operations) + + (cli/recursive-parse ["reindex" "source" "remote" "--host=foo"] core/cli-operations)) diff --git a/src/cli/operation.clj b/src/cli/operation.clj new file mode 100644 index 0000000..ed49ba7 --- /dev/null +++ b/src/cli/operation.clj @@ -0,0 +1,81 @@ +(ns cli.operation + (:require [clojure.string :as str] + [clojure.tools.cli :as tools-cli] + [server :as server])) + +(defn non-scalar? + "True when `value` is a map." + [value] + (map? value)) + +(defn value->parse-fn [value] + (cond + (number? value) [:parse-fn #(Integer/parseInt %)] + (boolean? value) [:parse-fn #(Boolean/parseBoolean %)] + (list? value) [:parse-fn #(str/split % #",")] + (vector?
value) [:parse-fn #(str/split % #",")] + (map? value) [])) + +(defn value->default + "Returns [:default value] for a scalar `value` and an empty vector for a map, list or vector." + [value] + (cond + (map? value) [] + (list? value) [] + (vector? value) [] + :else [:default value])) + +(defn remove-non-scalars + "Returns `defaults` without the entries whose value is a map." + [defaults] + (into {} (remove (fn [[_ v]] (non-scalar? v)) defaults))) + +(defn defaults->cli-opts + "Builds one option spec per entry of `defaults` whose value is not a map: no short option, long option --KEY=KEY, empty description, followed by whatever value->parse-fn returns for the default value." + [defaults] + (mapv (fn [[k v]] + (concat + [nil + (str "--" (name k) "=" (name k)) + ""] + (value->parse-fn v))) + (remove-non-scalars defaults))) + +(comment + (defaults->cli-opts {:sink {:remote {:host "fooo"}}})) + +(defn operation-opts + "Option specs derived from `defaults` followed by the common --defaults, --docs, -f/--config-file and -h/--help specs." + [defaults] + (concat + (defaults->cli-opts defaults) + [[nil "--defaults" "Print to STDOUT the default configuration of the operation"] + [nil "--docs" "Print to STDOUT the docstring of the operation"] + ["-f" "--config-file CONFIG_FILE" "Path to the JSON file with operation config"] + ["-h" "--help"]])) + +(defn remove-nil-vals + "Returns a map holding the entries of `m` whose value is not nil. Entries whose value is false are kept, so a boolean option parsed as false is not dropped." + [m] + (into {} (remove (fn [[_ v]] (nil? v)) m))) + +(defn parse-opts [args defaults operation-name] + (let [{:keys [arguments] :as parsed} (tools-cli/parse-opts args (operation-opts defaults) + :in-order true)] + (loop [operation-args arguments + parsed-opts parsed] + (let [param-name (keyword (first operation-args)) + subdefaults (get defaults param-name)] + (if (and (seq operation-args) (some? subdefaults)) + (let [cli-opts (operation-opts subdefaults) + {subcommand-options :options + subcommand-summary :summary + subcommand-errors :errors + unprocessed-args :arguments} (tools-cli/parse-opts (rest operation-args) + cli-opts + :in-order true)] + (recur unprocessed-args + (-> parsed-opts + (update :options assoc param-name (remove-nil-vals subcommand-options)) + (update :options (fn [options] (if (:help subcommand-options) + (assoc options :help true) + options))) + (update :summary (fn [summary] (format "%s\n%s:\n%s" summary (name param-name) + subcommand-summary))) + (update :errors concat subcommand-errors)))) + (if (or (nil? param-name) (some?
subdefaults)) + parsed-opts + (update parsed-opts :errors conj + (format "Configuration param '%s' for operation '%s' is not known" + param-name operation-name)))))))) + +(comment + (cli.operation/parse-opts ["--port" "8080" "-h"] + server/default-http-server-config + :server)) diff --git a/src/core.clj b/src/core.clj new file mode 100644 index 0000000..271f943 --- /dev/null +++ b/src/core.clj @@ -0,0 +1,88 @@ +(ns core + (:require [clojure.java.io :as io] + [clojure.tools.logging :as log] + [core.json :as json] + [cli :as cli] + [ops :as ops] + [server :as server] + [core.deep-merge :as dm]) + (:gen-class) + (:import (org.slf4j LoggerFactory) + (ch.qos.logback.classic Logger Level))) + +(defn find-operation [operation-name cli-operations] + (first (filter (fn [op] (= (name operation-name) (:name op))) cli-operations))) + +(defn read-config-file [config-file] + (if (and config-file (.exists (io/file config-file))) + (json/read-file config-file) + (do + (when config-file + (log/warnf "Config file '%s' does not exists" config-file)) + {}))) + +(defn execute-op [operation-name options cli-operations] + (if operation-name + (let [operation (find-operation operation-name cli-operations) + resp (cond + (true? (:docs options)) (:docs operation) + (true? (:defaults options)) (:defaults operation))] + (println + (json/encode + (if-let [msg (if (empty? 
resp) + ((:handler-fn operation) options) + resp)] + msg + (format "Operation '%s' is finished" (name operation-name)))))) + (log/warnf "Operation name was not provided"))) + +(defn handle-subcommand [{:keys [options] :as cli-opts} cli-operations] + (try + (if-let [operation (get options :operation)] + (let [config-file (get options :config-file) + file-options (read-config-file config-file)] + (execute-op operation file-options cli-operations)) + (let [{{operation-name :name + {:keys [options arguments summary errors]} :conf + :as my-op} :operation} cli-opts] + (if (seq errors) + (println errors) + (if (or (:help options) (empty? options)) + (println (format "Help for '%s':\n" (name operation-name)) summary) + (let [configs-from-file (read-config-file (:config-file options)) + combined-conf (dm/deep-merge configs-from-file options)] + (execute-op operation-name combined-conf cli-operations)))))) + (catch Exception e + (println (format "Failed to execute with exception:\n '%s'" e)) + (.printStackTrace e)))) + +(def additional-operations + [{:name "server" + :handler-fn server/start + :docs (:doc (meta #'server/start)) + :defaults server/default-http-server-config}]) + +(def cli-operations + (concat ops/operations additional-operations)) + +(defn handle-cli [args] + (let [{:keys [options summary errors arguments] :as cli-opts} (cli/recursive-parse args cli-operations)] + (if errors + (println errors) + (if (or (get options :help) (and (empty? options) (empty? 
arguments))) + (println summary) + (handle-subcommand cli-opts cli-operations))))) + +(comment + (core/handle-cli ["-o" "foo" "-f" "a"]) + (core/handle-cli ["replay" "sink" "--connection.url=http://localhost:9200"]) + (core/handle-cli ["replay" "sink" "-h"]) + (core/handle-cli [])) + +(defn -main [& args] + (when-let [logger-level (System/getenv "ROOT_LOGGER_LEVEL")] + (.setLevel ^Logger + (LoggerFactory/getLogger Logger/ROOT_LOGGER_NAME) + (Level/valueOf (str logger-level)))) + (handle-cli args) + (shutdown-agents)) diff --git a/src/core/async.clj b/src/core/async.clj new file mode 100644 index 0000000..a65ea33 --- /dev/null +++ b/src/core/async.clj @@ -0,0 +1,34 @@ +(ns core.async + (:require [clojure.core.async :as a] + [clojure.core.async.impl.protocols :as impl])) + +(defn seq-of-chan + "Creates a lazy seq from a core.async channel." + [c] + (lazy-seq + (let [fst (a/ {:a {:b {:x "x", :y "y"}}} + ) diff --git a/src/core/http.clj b/src/core/http.clj new file mode 100644 index 0000000..8ee6b02 --- /dev/null +++ b/src/core/http.clj @@ -0,0 +1,12 @@ +(ns core.http + (:require [org.httpkit.client :as http]) + (:import (javax.net.ssl SSLParameters SSLEngine SNIHostName) + (java.net URI))) + +(defn sni-configure + [^SSLEngine ssl-engine ^URI uri] + (let [^SSLParameters ssl-params (.getSSLParameters ssl-engine)] + (.setServerNames ssl-params [(SNIHostName. (.getHost uri))]) + (.setSSLParameters ssl-engine ssl-params))) + +(def client (delay (http/make-client {:ssl-configurer sni-configure}))) diff --git a/src/core/ilm.clj b/src/core/ilm.clj new file mode 100644 index 0000000..3e7c15f --- /dev/null +++ b/src/core/ilm.clj @@ -0,0 +1,48 @@ +(ns core.ilm + (:require [core.json :as json] + [core.http :as http-client] + [org.httpkit.client :as http])) + +;TODO: set timeout and exponential backoff +(defn index-exists? 
[es-host index-name] + @(http/request + {:method :head + :client @http-client/client + :url (format "%s/%s" es-host index-name)} + (fn [resp] (not (= 404 (:status resp)))))) + +(defn set-refresh-interval! [dest-host dest-index interval-value] + @(http/request + {:method :put + :client @http-client/client + :url (format "%s/%s/_settings" dest-host dest-index) + :headers {"Content-Type" "application/json"} + :body (json/encode {"index.refresh_interval" interval-value})} + (fn [resp] (json/decode (:body resp))))) + +(defn create-index! + ([dest-host dest-index] (create-index! dest-host dest-index {})) + ([dest-host dest-index index-conf] + @(http/request + {:method :put + :client @http-client/client + :url (format "%s/%s" dest-host dest-index) + :headers {"Content-Type" "application/json"} + :body (json/encode index-conf)} + (fn [resp] (json/decode (:body resp)))))) + +(defn refresh-index! [dest-host dest-index] + @(http/request + {:method :get + :client @http-client/client + :url (format "%s/%s/_refresh" dest-host dest-index) + :headers {"Content-Type" "application/json"}} + (fn [resp] (json/decode (:body resp))))) + +(defn delete-index! [dest-host dest-index] + @(http/request + {:method :delete + :client @http-client/client + :url (format "%s/%s" dest-host dest-index) + :headers {"Content-Type" "application/json"}} + (fn [resp] (json/decode (:body resp))))) diff --git a/src/core/json.clj b/src/core/json.clj new file mode 100644 index 0000000..bc3102a --- /dev/null +++ b/src/core/json.clj @@ -0,0 +1,15 @@ +(ns core.json + (:require [jsonista.core :as json])) + +(defn decode + ([obj] (decode obj true)) + ([obj keywordize?] 
(json/read-value obj (json/object-mapper {:decode-key-fn keywordize?})))) + +(defn encode + ([obj] (encode obj {})) + ([obj object-mapper-opts] (json/write-value-as-string obj (json/object-mapper object-mapper-opts)))) + +(defn encode-vanilla [obj] (json/write-value-as-string obj)) + +(defn read-file [^String file] + (decode (slurp file))) diff --git a/src/core/pprint.clj b/src/core/pprint.clj new file mode 100644 index 0000000..5fcbfd7 --- /dev/null +++ b/src/core/pprint.clj @@ -0,0 +1,44 @@ +(ns core.pprint + (:require [clojure.pprint :as pprint] + [clojure.string :as str])) + +(defn keys-in + "Returns a sequence of all key paths in a given map using DFS walk." + [m] + (letfn [(children [node] + (let [v (get-in m node)] + (if (map? v) + (map (fn [x] (conj node x)) (keys v)) + []))) + (branch? [node] (-> (children node) seq boolean))] + (->> (keys m) + (map vector) + (mapcat #(tree-seq branch? children %))))) + +(defn kv-string + ([m] (kv-string m {})) + ([m opts] + (let [kseqs (keys-in m) + kseqs (if-let [max-depth (:depth opts)] + (remove (fn [kseq] (< max-depth (count kseq))) kseqs) + kseqs)] + (with-out-str + (pprint/print-table + (->> kseqs + (map (fn [kseq] + {:k (keyword (str/join "." (map name kseq))) + :v (get-in m kseq)})) + (remove (fn [m] (coll? (:v m)))) + (sort-by :k))))))) + +(defn paths-to-vals [m] + (let [paths (keys-in m)] + (filter (fn [path] (not (map? 
(get-in m path)))) paths))) + +(defn conf-merge + "Merges two maps not overriding values" + [conf-a conf-b] + (let [paths (paths-to-vals conf-b)] + (reduce (fn [acc path] + (let [value (get-in conf-b path)] + (assoc-in acc path value))) conf-a paths))) diff --git a/src/core/properties.clj b/src/core/properties.clj new file mode 100644 index 0000000..32b4884 --- /dev/null +++ b/src/core/properties.clj @@ -0,0 +1,17 @@ +(ns core.properties + (:require [clojure.string :as str]) + (:import (java.util Properties))) + +(defn ^Properties opts->properties [opts] + (reduce (fn [^Properties props [k v]] + (.put props ^String (name k) ^String v) + props) + (Properties.) + opts)) + +(defn opts-valid? [required-key opts] + (assert + (not (and (str/blank? (get opts required-key)) + (str/blank? (get opts (keyword required-key))))) + (format "Required kafka param='%s' option is not provided." + required-key))) diff --git a/src/ops.clj b/src/ops.clj new file mode 100644 index 0000000..cb5900e --- /dev/null +++ b/src/ops.clj @@ -0,0 +1,57 @@ +(ns ops + (:require [ops.es-to-es :as es-to-es] + [ops.es-to-kafka :as es-to-kafka] + [ops.es-to-ndjson :as es-to-ndjson] + [ops.kafka-to-es :as kafka-to-es] + [ops.kafka-to-kafka :as kafka-to-kafka] + [ops.kafka-to-ndjson :as kafka-to-ndjson] + [ops.krp-to-ndjson :as krp-to-ndjson] + [replay.core :as replay] + [replay.deep :as replay.deep] + [polyglot :as polyglot])) + +(def operations + [{:name "kafka-to-kafka" + :handler-fn kafka-to-kafka/execute + :docs (:doc (meta #'kafka-to-kafka/execute)) + :defaults kafka-to-kafka/default-opts} + {:name "kafka-to-ndjson" + :handler-fn kafka-to-ndjson/execute + :docs (:doc (meta #'kafka-to-ndjson/execute)) + :defaults kafka-to-ndjson/default-kafka-ndjson-config} + {:name "kafka-to-elasticsearch" + :handler-fn kafka-to-es/kafka-to-es + :docs (:doc (meta #'kafka-to-es/kafka-to-es)) + :defaults kafka-to-es/default-opts} + {:name "elasticsearch-to-elasticsearch" + :handler-fn es-to-es/reindex! 
+ :docs (:doc (meta #'es-to-es/reindex!)) + :defaults es-to-es/default-reindex-config} + {:name "reindex" + :handler-fn es-to-es/reindex! + :docs (:doc (meta #'es-to-es/reindex!)) + :defaults es-to-es/default-reindex-config} + {:name "elasticsearch-to-kafka" + :handler-fn es-to-kafka/es->kafka + :docs (:doc (meta #'es-to-kafka/es->kafka)) + :defaults es-to-kafka/default-es-to-kafka-config} + {:name "elasticsearch-to-ndjson" + :handler-fn es-to-ndjson/es->ndjson + :docs (:doc (meta #'es-to-ndjson/es->ndjson)) + :defaults es-to-ndjson/default-es-to-ndjson-config} + {:name "krp-to-ndjson" + :handler-fn krp-to-ndjson/execute + :docs (:doc (meta #'krp-to-ndjson/execute)) + :defaults krp-to-ndjson/default-krp-ndjson-config} + {:name "replay" + :handler-fn replay/replay + :docs (:doc (meta #'replay/replay)) + :defaults replay/defaults} + {:name "deep-replay" + :handler-fn replay.deep/replay + :docs (:doc (meta #'replay.deep/replay)) + :defaults {}} + {:name "polyglot" + :handler-fn polyglot/apply-transformation + :docs (:doc (meta #'polyglot/apply-transformation)) + :defaults polyglot/polyglot-defaults}]) diff --git a/src/ops/es_to_es.clj b/src/ops/es_to_es.clj new file mode 100644 index 0000000..95e8ae0 --- /dev/null +++ b/src/ops/es_to_es.clj @@ -0,0 +1,50 @@ +(ns ops.es-to-es + (:require [clojure.tools.logging :as log] + [core.deep-merge :as deep-merge] + [sink.elasticsearch.index :as elasticsearch] + [source.elasticsearch.records :as records])) + +(def default-reindex-config + {:max_docs nil + :source {:remote {:connect_timeout "10s" + :host "http://localhost:9200" + :socket_timeout "1m"} + :index "source" + :query {:sort ["_doc"] + :size 5000}} + :sink (deep-merge/deep-merge + elasticsearch/default-opts + {:batch.size 1000 + :dest.index "reindex_sink_index" + :max.in.flight.requests 200 + :read.timeout.ms 60000 + :retry.backoff.ms 1000 + :flush.timeout.ms 100000})}) + +(defn reindex! + "Indexes data from one or many Elasticsearch indices + to some specified index. 
+ Reindexing can be done between multiple ES clusters." + [opts] + (log/infof "Starting reindexing with config: '%s'" opts) + (elasticsearch/store! (records/fetch opts default-reindex-config) + (merge (:sink default-reindex-config) + (-> opts + :sink + (update :connection.url (fn [connection-url] + (if (nil? connection-url) + (get-in opts [:dest :remote :host]) + connection-url))) + (update :dest.index (fn [dest-index] + (if (nil? dest-index) + (get-in opts [:dest :index]) + dest-index)))))) + (log/infof "Finished reindexing: '%s'" opts)) + +(comment + (ops.es-to-es/reindex! + {:max_docs 1000 + :source {:remote {:host "http://localhost:9200"} + :index ".kibana"} + :dest {:index ".kibana-backup" + :remote {:host "http://localhost:9200"}}})) diff --git a/src/ops/es_to_kafka.clj b/src/ops/es_to_kafka.clj new file mode 100644 index 0000000..7d188a4 --- /dev/null +++ b/src/ops/es_to_kafka.clj @@ -0,0 +1,50 @@ +(ns ops.es-to-kafka + (:require [sink :as sink] + [source :as source]) + (:import (sink KafkaRecord))) + +(def default-es-to-kafka-config + {:max_docs nil + :source {:implementation :elasticsearch + :remote {:connect_timeout "10s" + :host "http://localhost:9200" + :socket_timeout "1m"} + :index "*" + :query {:sort ["_doc"] + :size 2000} + :keywordize? false} + :dest {} + :sink {:implementation :kafka + :topic "sink-topic" + :bootstrap.servers "127.0.0.1:9092"}}) + +(defn es->kafka [opts] + (sink/store! + (map (fn [es-record] + (KafkaRecord. + (get es-record :_id (get es-record "_id")) + (get es-record :_source (get es-record "_source")) + (dissoc es-record :_id "_id" :_source "_source"))) + (source/fetch! 
(assoc opts :source (merge (:source default-es-to-kafka-config) + (:source opts))))) + (assoc opts :sink + (merge (:sink default-es-to-kafka-config) + (:sink opts))))) + +(comment + ; all records from Elasticsearch + (es->kafka + {:sink {:topic "es-tool-sink" + :bootstrap.servers "127.0.0.1:9092"}}) + ; max 10000 and selected records from Elasticsearch + (es->kafka + {:max_docs 10000 + :source {:remote {:connect_timeout "10s" + :host "http://localhost:9200" + :socket_timeout "1m"} + :index ".kibana" + :query {:sort ["_doc"] + :size 2000} + :keywordize? false} + :sink {:topic "kibana-data" + :bootstrap.servers "127.0.0.1:9092"}})) diff --git a/src/ops/es_to_ndjson.clj b/src/ops/es_to_ndjson.clj new file mode 100644 index 0000000..7e24129 --- /dev/null +++ b/src/ops/es_to_ndjson.clj @@ -0,0 +1,34 @@ +(ns ops.es-to-ndjson + (:require [core.json :as json] + [sink :as sink] + [source :as source])) + +(def default-es-to-ndjson-config + {:max_docs nil + :source {:implementation :elasticsearch + :remote {:host "http://localhost:9200"} + :index "*" + :query {:sort ["_doc"] + :size 2000}} + :sink {:implementation :file}}) + +(defn es->ndjson [opts] + (sink/store! + (mapcat (fn [hit] + [{:value (json/encode {:index (select-keys hit [:_index :_id :_type])})} + (assoc hit :value (json/encode (:_source hit)))]) + (source/fetch! 
(assoc opts :source (merge (:source default-es-to-ndjson-config) + (:source opts))))) + (assoc opts :sink + (merge (:sink default-es-to-ndjson-config) + (:sink opts))))) + +(comment + (es->ndjson + {:max_docs 1 + :source {:implementation :elasticsearch + :remote {:host "http://localhost:9200"} + :index ".kibana" + :query {:sort ["_doc"] + :size 2000}} + :sink {:implementation :file}})) diff --git a/src/ops/kafka_to_es.clj b/src/ops/kafka_to_es.clj new file mode 100644 index 0000000..f06f6d7 --- /dev/null +++ b/src/ops/kafka_to_es.clj @@ -0,0 +1,41 @@ +(ns ops.kafka-to-es + (:require [sink.elasticsearch.index :as indexer] + [source :as source])) + +(def default-opts + {:max_docs 5 + :source {:implementation :kafka + :bootstrap.servers "127.0.0.1:9092" + :decode-value? false} + :dest {:remote {:connect_timeout "10s" + :host "http://localhost:9200" + :socket_timeout "1m"}} + :sink (merge {:implementation :elasticsearch} + indexer/default-opts)}) + +(defn kafka-to-es [opts] + (let [es-host (get-in opts [:dest :remote :host]) + index-name (-> opts :dest :index) + kafka-records (source/fetch! (assoc opts :source (merge (:source default-opts) + (:source opts))))] + (indexer/store! 
(map (fn [record] + (indexer/->EsRecord + (if-let [k (:key record)] + k + (str (:topic record) "+" (:partition record) "+" (:offset record))) + (:value record))) + kafka-records) + (merge + (:sink default-opts) + {:connection.url es-host + :dest.index index-name + :already.encoded true})))) + +(comment + (kafka-to-es + {:max_docs 1 + :source {:topic "source-topic" + :bootstrap.servers "127.0.0.1:9092"} + :dest {:index "dest-index-name" + :remote {:host "http://localhost:9200"}} + :sink {}})) diff --git a/src/ops/kafka_to_kafka.clj b/src/ops/kafka_to_kafka.clj new file mode 100644 index 0000000..3e5fade --- /dev/null +++ b/src/ops/kafka_to_kafka.clj @@ -0,0 +1,28 @@ +(ns ops.kafka-to-kafka + (:require [sink :as sink] + [source :as source])) + +(def default-opts + {:max_docs 5 + :source {:implementation :kafka + :bootstrap.servers "127.0.0.1:9092" + :decode-value? false} + :sink {:implementation :kafka + :bootstrap.servers "127.0.0.1:9092" + :encode-value? false}}) + +(defn execute + "Read records from a Kafka topic(s) and writes them another topic." + [opts] + (let [source-opts (merge (:source default-opts) (:source opts)) + sink-opts (merge (:sink default-opts) (:sink opts)) + records (source/fetch! (assoc opts :source source-opts))] + (sink/store! records (assoc opts :sink sink-opts)))) + +(comment + (execute + {:max_docs 1 + :source {:topic "source-topic" + :bootstrap.servers "127.0.0.1:9092"} + :sink {:topic "sink-topic" + :bootstrap.servers "127.0.0.1:9092"}})) diff --git a/src/ops/kafka_to_ndjson.clj b/src/ops/kafka_to_ndjson.clj new file mode 100644 index 0000000..8ab82ff --- /dev/null +++ b/src/ops/kafka_to_ndjson.clj @@ -0,0 +1,33 @@ +(ns ops.kafka-to-ndjson + (:require [core.json :as json] + [sink :as sink] + [source :as source])) + +(def default-kafka-ndjson-config + {:max_docs nil + :source {:implementation :kafka + :bootstrap.servers "127.0.0.1:9092" + :decode-value? true} + :sink {:implementation :file}}) + +(defn execute [opts] + (sink/store! 
+ (mapcat (fn [record] + [{:value (json/encode {:index {:_id (:key record) + :_index (:topic record)}})} + (assoc record :value (json/encode (:value record)))]) + (source/fetch! (assoc opts :source (merge (:source default-kafka-ndjson-config) + (:source opts))))) + (assoc opts :sink + (merge (:sink default-kafka-ndjson-config) + (:sink opts))))) + +(comment + (execute + {:max_docs nil + :source {:bootstrap.servers "127.0.0.1:9092" + :topic "topic-name" + :impatient? true + :retry-count 2} + :sink {:implementation :file + :filename "es-docs.ndjson"}})) diff --git a/src/ops/krp_to_ndjson.clj b/src/ops/krp_to_ndjson.clj new file mode 100644 index 0000000..f9b064a --- /dev/null +++ b/src/ops/krp_to_ndjson.clj @@ -0,0 +1,63 @@ +(ns ops.krp-to-ndjson + (:require [core.json :as json] + [core.async :as async] + [sink :as sink] + [source :as source] + [clojure.tools.logging :as log])) + +(def default-krp-ndjson-config + {:max_docs nil + :source {:implementation :krp + :connection.url "http://localhost:8082"} + :sink {:implementation :file + :filename nil + :partition-size nil}}) + +(defn one-consumer [opts] + (sink/store! + (mapcat (fn [record] + [{:value (json/encode {:index {:_id (:key record) + :_index (:topic record)}})} + (assoc record :value (json/encode (:value record)))]) + (source/fetch! 
(assoc opts :source (merge (:source default-krp-ndjson-config) + (:source opts))))) + (assoc opts :sink + (merge (:sink default-krp-ndjson-config) + (:sink opts))))) + +(defn execute + "Runs the KRP to ndjson export. When [:source :concurrency] is set, starts that many consumers in futures, numbered from 0: consumer N gets [:source :partitions] set to [N], :max_docs divided by the concurrency and rounded up (nil when :max_docs is nil), N- prepended to [:source :consumer.name] and -N appended to [:sink :filename]. All futures are started before any of them is awaited. Without :concurrency a single consumer is run with `opts` as given." + [opts] + (let [concurency (-> opts :source :concurrency) + max-docs (:max_docs opts)] + (if concurency + (let [max-docs-per-partition (when max-docs + (int (Math/ceil (/ max-docs concurency))))] + (doseq [fut (mapv (fn [thread-nr] + (future + (log/infof "Started KRP consumer '%s'" thread-nr) + (one-consumer (-> opts + (assoc-in [:max_docs] max-docs-per-partition) + (assoc-in [:source :partitions] [thread-nr]) + (update-in [:source :consumer.name] (fn [consumer-instance] (str thread-nr "-" consumer-instance))) + (update-in [:sink :filename] (fn [filename] (str filename "-" thread-nr))))))) + (range concurency))] + @fut)) + (one-consumer opts)))) + +(comment + (execute + {:max_docs nil + :source {:connection.url "http://localhost:8082" + :topic "keywords-test-topic" + :offset 0} + :sink {:filename "ndjson/ket-docs.ndjson" + :partition-size 10000}}) + + (execute + {:max_docs 2000 + :source {:connection.url "http://localhost:9200" + :topic "deep-replay" + :group.id "krp-group2_instance" + :consumer.name "krp2_instance" + :offset 0 + :concurrency 20 + :delete.consumer.instance false} + :sink {:filename "directory/docs" + :partition-size 1000}})) diff --git a/src/polyglot.clj b/src/polyglot.clj new file mode 100644 index 0000000..4cfc1a0 --- /dev/null +++ b/src/polyglot.clj @@ -0,0 +1,74 @@ +(ns polyglot + (:require [clojure.string :as s] + [clojure.java.io :as io] + [core.json :as json] + [polyglot.js :as js] + [polyglot.sci :as sci]) + (:import (java.io File))) + +(defn apply-js-transformation + "Given some JSON encoded data and a JS code snippet, GraalVM is going to interpret the JS code + and apply it to the JSON encoded data.
+
+  Input: {:data \"{\"my\": \"data\"}\" :script \"(s) => s\"}"
+  [{:keys [data script]}]
+  (js/string->string data script))
+
+(defn apply-sci-transformation
+  "Given some JSON encoded data and a SCI code snippet, GraalVM is going to interpret the SCI code
+  and apply it to the JSON encoded data.
+
+  Input: {:data \"{\"my\": \"data\"}\" :script \"(fn [m] (assoc m :foo :bar))\"}"
+  [{:keys [data script]}]
+  (sci/string->string data script))
+
+(comment
+  (polyglot/apply-sci-transformation
+    {:data "{\"my\": \"data\"}"
+     :script "(fn [m] (assoc m :foo :bar))"}))
+
+(defn apply-transformation
+  "Given some JSON encoded data and a code snippet, interprets the snippet in the language
+  named by :lang and applies it to the JSON encoded data.
+  :lang is :js or :sci; a string is trimmed, lower-cased and turned into a keyword first.
+  The snippet is taken from :script; when :script is nil it is read from the file at :file.
+  Returns the request with a :result key that holds either the outcome of the transformation
+  or a map with an :error message (no script and no file, missing file, unsupported language).
+
+  Input: {:lang \"js\" :data \"{\"my\": \"data\"}\" :script \"(s) => s\"}"
+  [{:keys [lang file script] :as req}]
+  (assoc req :result
+             (cond
+               (and (nil? script) (nil? file))
+               {:error "Neither script nor file are provided"}
+
+               (and (nil? script) (not (.exists ^File (io/file file))))
+               {:error (format "Provided file '%s' doesn't exist" file)}
+
+               :else
+               (let [lang-code (if (string? lang)
+                                 (-> lang (s/trim) (s/lower-case) keyword)
+                                 lang)
+                     ;; a nil script at this point means the file was found by the check above,
+                     ;; so it is read without asking the filesystem a second time
+                     req (cond-> req
+                           (nil? script) (assoc :script (slurp file)))]
+                 (case lang-code
+                   :js (apply-js-transformation req)
+                   :sci (apply-sci-transformation req)
+                   {:error (format "Language '%s' is not supported." lang)})))))
+
+(def polyglot-defaults
+  {:data "{\"my\": \"data\"}"
+   :lang "js"
+   :script "(s) => s"
+   :file "/path/to/file"})
+
+(comment
+  (polyglot/apply-transformation polyglot-defaults))
+
+(defn map->map [m fsc]
+  (json/decode (js/string->string (json/encode m) fsc)))
+
+(defn append-timestamp-to-map
+  "Map keys must be strings."
+  [m]
+  (js/transform-map-js m "(m) => {var target = Object.assign({}, m);target['timestamp'] = new Date().toISOString(); return target;}"))
+
+(comment
+  (append-timestamp-to-map {"graalvm and clojure" "rocks!"})
+  ;=> {"graalvm and clojure" "rocks!", "timestamp" "2020-06-14T22:34:12.768Z"}
+  ;;Native image compiler can compile these commands 137 OOM
+  ;:polyglot (polyglot/append-timestamp-to-map options)
+  )
diff --git a/src/polyglot/js.clj b/src/polyglot/js.clj
new file mode 100644
index 0000000..2f58791
--- /dev/null
+++ b/src/polyglot/js.clj
@@ -0,0 +1,55 @@
+(ns polyglot.js
+  (:require [core.json :as json])
+  (:import (org.graalvm.polyglot Value Context HostAccess)
+           (java.util Map)
+           (org.graalvm.polyglot.proxy ProxyObject)))
+
+(defn transform-map-js
+  "Evaluates `function-source-code` as a JS function in a fresh context, calls it with
+  a proxy object over the map `m` and returns the result viewed as a java.util.Map.
+  NOTE(review): the context is never closed here; the returned map presumably stays
+  backed by it, so confirm that before adding a close."
+  [m function-source-code]
+  (let [^Context ctx (-> (Context/newBuilder (into-array String ["js"]))
+                         (.allowHostAccess HostAccess/ALL)
+                         (.build))
+        initial-proxy (ProxyObject/fromMap m)
+        ^Value f (.eval ctx "js" function-source-code)]
+    (.as ^Value (.execute ^Value f (object-array [initial-proxy])) ^Class Map)))
+
+(defn string->string
+  "Binds the JSON string `s` as `m` in a fresh JS context, applies the JS function given in
+  `transformation-source-code` to the parsed value and returns the result as a JSON string.
+  The context is closed on the way out (also when the evaluation throws); the result is
+  copied out as a Java string before that happens."
+  [s transformation-source-code]
+  (with-open [^Context ctx (-> (Context/newBuilder (into-array String ["js"]))
+                               (.allowHostAccess HostAccess/ALL)
+                               (.build))]
+    (.putMember (.getBindings ctx "js") "m" s)
+    (let [^Value f (.eval ctx "js" (format "JSON.stringify((%s)(JSON.parse(m)))" transformation-source-code))]
+      (.asString ^Value f))))
+
+; https://www.graalvm.org/reference-manual/embed-languages/
+(defn script->transform-fn
+  "Given a JavaScript source code snippet creates a function that expects
+  string that is going to be passed into that function as an argument.
+  Returns a string."
+ [^String script] + (let [^Context ctx (.build (Context/newBuilder (into-array String ["js"]))) + ^String wrapped-script (format "(query) => JSON.stringify((%s)(JSON.parse(query)))" script) + ^Value f (.eval ctx "js" wrapped-script)] + (fn [^String value] (.asString (.execute f (object-array [value])))))) + +(defn script->transform-fn-vals + "Given a JavaScript source code snippet creates a function that expects + string that is going to be passed into that function as an argument. + Returns a string." + [^String script] + (let [^Context ctx (.build (Context/newBuilder (into-array String ["js"]))) + ^String wrapped-script (format "(...args) => { args[0] = JSON.parse(args[0]); return JSON.stringify((%s).apply(null, args));}" script) + ^Value f (.eval ctx "js" wrapped-script)] + (fn [& vals] (.asString (.execute f (object-array vals)))))) + +(comment + (time + (let [tf (time (polyglot.js/script->transform-fn + "(request) => {const deepEqual=(e,t)=>{const r=Object.keys(e),n=Object.keys(t);if(r.length!==n.length)return!1;for(const n of r){const r=e[n],s=t[n],a=isObject(r)&&isObject(s);if(a&&!deepEqual(r,s)||!a&&r!==s)return!1}return!0},isObject=e=>null!==e&&\"object\"==typeof e,remove=[...request.query.bool.filter].filter(e=>e.range).filter(e=>Object.keys(e.range).includes(\"created_at\")||Object.keys(e.range).includes(\"user_updated_at\")).filter(e=>{const t=e.range.created_at&&1===Object.keys(e.range.created_at).length&&Object.keys(e.range.created_at).includes(\"lte\"),r=e.range.user_updated_at&&1===Object.keys(e.range.user_updated_at).length&&Object.keys(e.range.user_updated_at).includes(\"lte\");return t||r}),transform=[...remove].map(e=>{const t=Object.keys(e.range)[0];return{range:{[t]:{gt:e.range[t].lte}}}});request.query.bool.filter=request.query.bool.filter.filter(e=>!remove.some(t=>deepEqual(e,t))),request.query.bool.must_not.push(...transform);return request;}")) + data 
"{\"_source\":false,\"stats\":[\"regular_items_lookup\"],\"query\":{\"bool\":{\"must\":[],\"filter\":[{\"range\":{\"created_at\":{\"lte\":\"2020-12-23T15:55:00+01:00\",\"gte\":\"2020-12-14T15:55:00+01:00\"}}},{\"range\":{\"user_updated_at\":{\"lte\":\"2020-12-23T15:55:00+01:00\"}}},{\"terms\":{\"country_id\":[7,10,16,18,19,20]}},{\"terms\":{\"catalog_tree_ids\":[1653]}}],\"must_not\":[{\"exists\":{\"field\":\"stress_test\"}},{\"exists\":{\"field\":\"user_shadow_banned\"}},{\"bool\":{\"must_not\":{\"term\":{\"country_id\":16}},\"filter\":{\"range\":{\"international_visibility_enabled_from\":{\"gt\":\"2020-12-23T15:55:00+01:00\"}}}}}],\"should\":[{\"bool\":{\"filter\":{\"range\":{\"user_updated_at\":{\"gte\":\"2020-11-23\"}}},\"must\":[{\"distance_feature\":{\"field\":\"user_updated_at\",\"origin\":\"2020-12-23T15:55:00+01:00\",\"pivot\":\"7d\",\"boost\":1.25}}]}},{\"constant_score\":{\"filter\":{\"range\":{\"promoted_until\":{\"gte\":\"2020-12-23T15:55:00+01:00\"}}},\"boost\":1.8}},{\"bool\":{\"must_not\":{\"term\":{\"country_id\":16}},\"must\":{\"range\":{\"international_visibility_enabled_from\":{\"boost\":1.0,\"lte\":\"2020-12-23T15:55:00+01:00\",\"gte\":\"2020-12-23T07:55:00+01:00\"}}}}}]}},\"from\":0,\"size\":24,\"rescore\":[]}"] + (dotimes [i 10000] + (tf data)))) + + ; The cool trick can be that you encode many values in a string, and in your function parse it out. 
+ (let [tf (polyglot.js/script->transform-fn-vals + "(x, y) => {return [x, y]}")] + (tf (json/encode {:a "1"}) "2" 3 4))) diff --git a/src/polyglot/sci.clj b/src/polyglot/sci.clj new file mode 100644 index 0000000..b6ca30c --- /dev/null +++ b/src/polyglot/sci.clj @@ -0,0 +1,27 @@ +(ns polyglot.sci + (:require [sci.core :as sci] + [core.json :as json])) + +(defn sci-compile [^String script] + (sci/eval-string script)) + +(defn script->transform-fn [script] + (let [transform-fn (sci-compile script)] + (fn [^String value] + (json/encode (transform-fn (json/decode value)))))) + +(defn apply-sci-transform [m script] + (let [transform-f (sci-compile script)] + (transform-f m))) + +(defn string->string [s transform-script] + (json/encode (apply-sci-transform (json/decode s) transform-script))) + +; specializations for multi argument functions +(defn script->transform-fn-for-boost [script] + (let [transform-fn (sci-compile script)] + (fn [& args] + (apply transform-fn args)))) + +(comment + ((polyglot.sci/script->transform-fn-for-boost "(fn [& args] args)") 1 2 3)) diff --git a/src/replay/core.clj b/src/replay/core.clj new file mode 100644 index 0000000..5158f16 --- /dev/null +++ b/src/replay/core.clj @@ -0,0 +1,126 @@ +(ns replay.core + (:require [clojure.core.async :as a] + [clojure.tools.logging :as log] + [org.httpkit.client :as http] + [core.async :as async] + [core.http :as http-client] + [core.json :as json] + [sink.elasticsearch.index :as es-sink] + [source.elasticsearch :as es] + [replay.transform.uri :as transform-uri] + [replay.transform.query :as transform-query]) + (:import (java.time Instant) + (java.util UUID))) + +(defn hits-count [resp-body] + (let [hits-total (get-in resp-body [:hits :total])] + (if (number? 
hits-total) + {:value hits-total + :relation "eq"} + hits-total))) + +(defn post-process [input-doc endpoint query start resp replay-conf] + (let [replay-data-kw (keyword (or (get replay-conf :replay_data_attr) :replay)) + decoded-body (json/decode (get resp :body))] + (-> input-doc + (assoc-in [:_id] (str (UUID/randomUUID))) + (assoc-in [:_source replay-data-kw :config] (json/encode replay-conf)) + (assoc-in [:_source replay-data-kw :modified-query] query) + (assoc-in [:_source replay-data-kw :id] (:id replay-conf)) + (assoc-in [:_source replay-data-kw :query-id] (:_id input-doc)) + (assoc-in [:_source replay-data-kw :timestamp] (str (Instant/now))) + (assoc-in [:_source replay-data-kw :endpoint] endpoint) + (assoc-in [:_source replay-data-kw :response] (get resp :body)) + (assoc-in [:_source replay-data-kw :service-time] (- (System/currentTimeMillis) start)) + (assoc-in [:_source replay-data-kw :hits] (hits-count decoded-body)) + (assoc-in [:_source replay-data-kw :es-time] (:took decoded-body))))) + +(defn query-es-afn [conf] + (let [replay-conf (:replay conf) + es-host (:connection.url replay-conf) + transform-fn (transform-query/transform-fn (:query-transforms replay-conf)) + original-query-key (keyword (:query_attr replay-conf))] + (fn [{source :_source :as input-doc} channel] + (let [endpoint (transform-uri/construct-endpoint source replay-conf) + original-query (get source original-query-key) + query (transform-fn original-query) + start (System/currentTimeMillis)] + (http/request + {:method :get + :headers {"Content-Type" "application/json"} + :client @http-client/client + :url (format "%s%s" es-host endpoint) + :body query} + (fn [resp] + (when (string? (get resp :body)) + (a/>!! channel (post-process input-doc endpoint + (when-not (= original-query query) query) + start resp replay-conf))) + (a/close! 
channel))))))) + +(def defaults + {:max_docs 10 + :source {:remote {:host "http://localhost:9200"} + :index "query_logs_index"} + :replay {:id "id-of-the-replay" + :description "Description of the query replay." + :query_attr "request" + :uri_attr "uri" + :replay_data_attr "replay" + :uri-transforms [] + :query-transforms [] + :connection.url "http://localhost:9200" + :concurrency 1 + :repeats 1} + :sink {:connection.url "http://localhost:9200" + :dest.index "replay_sink_index" + :batch.size 50}}) + +(defn prepare-replay-conf [conf] + (let [initial-conf (:replay conf) + with-defaults (merge (:replay defaults) initial-conf)] + (cond-> with-defaults + (nil? (:id initial-conf)) (assoc :id (str (UUID/randomUUID)))))) + +(defn replay + "Take some queries from an Elasticsearch cluster (transform then) replay the the queries + to another Elasticsearch cluster, and store the responses in yet another Elasticsearch + cluster." + [conf] + (log/infof "Starting a replay with conf: '%s'" conf) + (let [replay-conf (prepare-replay-conf conf) + concurrency (:concurrency replay-conf) + repeats (:repeats replay-conf) + queries (mapcat (fn [query] (repeat repeats query)) (es/fetch conf)) + responses (async/map-pipeline-async (query-es-afn (assoc conf :replay replay-conf)) + concurrency queries)] + (es-sink/store! 
responses (:sink conf)))) + +(comment + (replay {:max_docs 10 + :source {:remote {:host "http://localhost:9200"} + :index "query_logs" + :query {:query {:bool + {:filter + [{:range {:header.timestamp {:gte "2020-07-30T00:58:12+02:00" + :lte "2020-07-30T23:00:00+02:00"}}} + {:range {:response_took {:gte 200}}} + {:match {:request "function_score"}} + {:prefix {:uri.keyword "/index_name/_search"}}] + :must_not + [{:match {:body "keyword"}}]}} + :sort [{:header.timestamp {:order :asc}}] + :size 1000}} + :replay {:id "int-ids-count" + :description "replay for integer IDs" + :query_attr "request" + :uri_attr "uri" + :replay_data_attr "replay" + :uri-transforms [] + :query-transforms [] + :connection.url "http://localhost:9200" + :concurrency 10 + :repeats 1} + :sink {:connection.url "http://localhost:9200" + :dest.index "dest-index" + :batch.size 50}})) diff --git a/src/replay/deep.clj b/src/replay/deep.clj new file mode 100644 index 0000000..ff4b1ec --- /dev/null +++ b/src/replay/deep.clj @@ -0,0 +1,107 @@ +(ns replay.deep + (:require [clojure.core.async :as a] + [clojure.tools.logging :as log] + [core.async :as async] + [core.json :as json] + [source.elasticsearch :as es] + [sink :as sink]) + (:import (java.time Instant))) + +(def DEFAULT_DEPTH 1) +(def DEFAULT_PAGE_SIZE 5000) + +(defn prepare-query [query replay-conf] + (-> query + (assoc :size (min (or (:depth replay-conf) DEFAULT_DEPTH) DEFAULT_PAGE_SIZE)) + (assoc :_source true) + (assoc :explain true) + (assoc :sort ["_score" {:created_at "desc"}]))) + +(defn query-es-afn [conf] + (let [replay-conf (:replay conf) + depth (or (:depth replay-conf) DEFAULT_DEPTH) + query-log-host (-> conf :source :remote :host) + dest-es-host (:connection.url replay-conf) + doc-fetch-strategy (or (:doc-fetch-strategy replay-conf) :search-after-with-pit)] + (fn [query-log-entry channel] + (let [index-name (or (:target-index replay-conf) + (-> query-log-entry :fields :uri.index first)) + query (-> query-log-entry :_source :request 
json/decode (prepare-query replay-conf)) + hits (es/fetch {:max_docs depth + :source {:remote {:host dest-es-host} + :index index-name + :query query + :strategy doc-fetch-strategy}})] + (sink/store! (map (fn [resp rank] + {:key (format "%s:%s:%s" (:id replay-conf) (:_id query-log-entry) rank) + :value {:replay_id (:id replay-conf) + :query_log_host query-log-host + :query_log_id (:_id query-log-entry) + :x_user_id (-> query-log-entry + :_source + :request_headers + :x-user-id + first) + :query_body (-> query-log-entry :_source :request) + :query-timestamp (str (Instant/ofEpochMilli + (-> query-log-entry + :_source + :header.timestamp))) + :replay-timestamp (str (Instant/now)) + :replay_host dest-es-host + :rank rank + :hit resp} + :headers {}}) + hits (range)) + conf) + (a/>!! channel (:_id query-log-entry)) + (a/close! channel))))) + +(defn replay + "From an Elasticsearch cluster takes some queries, replays them + to (another) Elasticsearch cluster with top-k results where k might be very big, like 1M. + Each hit with the metadata is written to a specified Kafka topic. + URIs can be transformed, queries can be transformed." 
+ [conf] + (let [replay-conf (:replay conf) + concurrency (or (:concurrency replay-conf) 50) + queries (es/fetch conf) + replays (async/map-pipeline-async (query-es-afn conf) concurrency queries)] + (doseq [replay replays] + (log/debugf "Replayed query: %s" replay)))) + +(comment + (replay.deep/replay + {:max_docs 10 + :source {:remote {:host "http://localhost:9200"} + :index "query_logs" + :query {:query {:bool + {:filter + [{:term {:query_from {:value 0}}} + {:term {:stats {:value "keyword_value"}}} + {:range {:header.timestamp {:gte "now-1m"}}} + {:match {:request "multi_match"}} + {:prefix {:uri.keyword "/index_name/_search"}}] + :must_not + [{:exists {:field "query_sort"}}]}} + :sort [{:header.timestamp {:order :asc}}] + :docvalue_fields ["uri.index"] + :size 10}} + :replay {:id "my-replay-id" + :description "my description" + :query_attr "request" + :uri_attr "uri" + :replay_data_attr "replay" + :uri-transforms [{:match "foo" + :replacement "bar"}] + :query-transforms [] + :connection.url "http://localhost:9200" + :target-index "target-index-name" + :doc-fetch-strategy :search-after-with-pit + :concurrency 10 + :repeats 1 + :depth 10000} + :sink {:implementation :kafka + :topic "deep-replay-topic" + :bootstrap.servers "127.0.0.1:9092" + :linger.ms 1000}})) diff --git a/src/replay/impact.clj b/src/replay/impact.clj new file mode 100644 index 0000000..b2d1e0d --- /dev/null +++ b/src/replay/impact.clj @@ -0,0 +1,157 @@ +(ns replay.impact + (:require [clojure.tools.logging :as log] + [core.async :as async] + [core.json :as json] + [core.deep-merge :as dp] + [scroll.request :as r] + [scroll.pit :as pit] + [sink.elasticsearch.index :as es-sink] + [source.elasticsearch :as es] + [replay.transform.uri :as transform.uri] + [replay.transform.impact :as impact-transform])) + +(set! 
*warn-on-reflection* true) + +; https://www.elastic.co/guide/en/elasticsearch/reference/current/search-rank-eval.html + +(defn get-index-or-alias [endpoint] + (last (re-find #"^/(.*)/_search" endpoint))) + +(defn prepare-endpoint [^String endpoint] + (transform.uri/transform-uri + endpoint + [{:match "preference=[^&]*&?" + :replacement ""} + {:match "routing=[^&]*&?" + :replacement ""} + {:match "^(/.*)(/.*)" + :replacement "$2"} + {:match "\\?$" + :replacement ""}])) + +(defn generate-queries [opts query-body] + (impact-transform/generate-queries query-body (get-in opts [:replay :query-transforms]))) + +(defn measure-impact [opts query-log-entry] + (let [es-host (get-in opts [:replay :connection.url]) + raw-endpoint (get-in query-log-entry [:_source :uri]) + endpoint (prepare-endpoint raw-endpoint) + target-index (get-index-or-alias raw-endpoint) + pit (assoc (pit/init es-host target-index opts) :keep_alive "30s") + query-string (get-in query-log-entry [:_source :request]) + query-body (json/decode query-string) + url (format "%s%s" es-host endpoint) + k (get-in opts [:replay :top-k]) + baseline-resp (r/execute-request + {:url url + :body (assoc query-body :pit pit :size k) + :opts (assoc r/default-exponential-backoff-params :keywordize? 
true) + :method :get + :headers r/default-headers})] + (let [metric {:precision {:k k :relevant_rating_threshold 1 :ignore_unlabeled false}} + ratings (map (fn [hit] (assoc (select-keys hit [:_index :_id]) :rating 1)) + (get-in baseline-resp [:hits :hits])) + target-url (format "%s/%s/_rank_eval" es-host target-index) + query-variations (generate-queries opts query-body) + grouped-variations (group-by (fn [qv] (json/encode (:variation qv))) + (map (fn [qv] (update qv :request assoc :size k)) query-variations)) + rank-eval-resp (r/execute-request + {:url target-url + :body {:requests (map (fn [[id [{request :request}]]] + {:id id + :request (assoc request :pit pit) + :ratings ratings}) + grouped-variations) + :metric metric} + :opts (assoc r/default-exponential-backoff-params :keywordize? true) + :method :get + :headers r/default-headers})] + (let [{:keys [details failures metric_score]} rank-eval-resp] + (map (fn [variation-id] + (let [query-log-entry-id (get query-log-entry :_id) + variation (first (get grouped-variations (name variation-id)))] + (-> query-log-entry + (update :_id (fn [replay-log-entry-id] (str replay-log-entry-id "-" (hash variation-id)))) + (assoc-in [:_source :query_log_entry_id] query-log-entry-id) + (assoc-in [:_source :impact] {:top-k k + :variation-id (name variation-id) + :variation (map (fn [variation-map] + (update variation-map :value str)) + (:variation variation)) + :query (json/encode (:request variation)) + :failures failures + :impact (float (- 1 (:metric_score (variation-id details)))) + :average-impact (float (- 1 metric_score)) + :hit-count (count (get-in details [variation-id :hits])) + :unrelated-count (count (get-in details [variation-id :unrated_docs])) + :metric-score (get-in details [variation-id :metric_score]) + :original-hit-count (count ratings) + :details (json/encode (get details variation-id))})))) + (keys details)))))) + +(def defaults + {:max_docs 1 + :source {:remote {:host "http://localhost:9200"} + :index 
"query_logs_index"} + :replay {:id "id-of-the-replay" + :description "Description of the query replay." + :query_attr "request" + :uri_attr "uri" + :replay_data_attr "replay" + :uri-transforms [] + :top-k 10 + :query-transforms [] + :connection.url "http://localhost:9200" + :concurrency 1} + :sink {:connection.url "http://localhost:9200" + :dest.index "impact_sink_index" + :batch.size 50}}) + +(defn prepare-replay-conf [conf] + (dp/deep-merge defaults conf)) + +(defn execute + "Fetches baseline query and for several boost values transforms query, + invokes _rank_eval API for metrics on what is the impact of the query transforms to the ranking. + Impact is defined as 1 minus precision-at-K." + [conf] + (log/infof "Starting a replay for impact with conf: '%s'" conf) + (let [replay-conf (prepare-replay-conf conf) + concurrency (get-in replay-conf [:replay :concurrency]) + queries (es/fetch conf) + responses (async/map-pipeline (fn [query-log-entry] + (measure-impact replay-conf query-log-entry)) + concurrency queries)] + (es-sink/store! 
(apply concat responses) (:sink conf)))) + +(comment + (replay.impact/execute + {:max_docs 100 + :source {:remote {:host "http://localhost:9200"} + :index "query_logs" + :query {:query {:bool + {:filter + [{:term {:query_from {:value 0}}} + {:term {:stats {:value "some value"}}} + {:range {:header.timestamp {:gte "now-2d"}}} + {:match {:request "multi_match"}} + {:prefix {:uri.keyword "/index_name/_search"}}] + :must_not + [{:exists {:field "query_sort"}}]}} + :sort [{:header.timestamp {:order :asc}}] + :docvalue_fields ["uri.index"] + :size 1}} + :replay {:connection.url "http://localhost:9200" + :concurrency 10 + :top-k 100 + :query-transforms [{:id "test" + :lang :sci + :script "(fn [query boost]\n (let [query-string (-> query\n (get-in [:query :bool :must])\n first\n (get-in [:constant_score :filter :multi_match :query]))\n clause-to-add {:constant_score {:boost boost\n :filter {:match {:title.folded {:_name \"boost_on_exactness\"\n :operator \"and\"\n :query query-string}}}}}]\n (update-in query [:query :bool :should] conj clause-to-add)))" + :vals [0.00001 0.0001 0.001 0.01 0.1 1 10 100 1000 10000]} + {:id "test2" + :lang :sci + :script "(fn [query boost] query)" + :vals [123]}]} + :sink {:connection.url "http://localhost:9200" + :dest.index "impact_sink_index" + :batch.size 50}})) diff --git a/src/replay/transform/impact.clj b/src/replay/transform/impact.clj new file mode 100644 index 0000000..6c54a3b --- /dev/null +++ b/src/replay/transform/impact.clj @@ -0,0 +1,72 @@ +(ns replay.transform.impact + (:require [polyglot.sci :as sci] + [polyglot.js :as js] + [core.json :as json])) + +(defn cart [colls] + (if (empty? 
colls) + '(()) + (for [more (cart (rest colls)) + x (first colls)] + (cons x more)))) + +(defn create-transformation-variations + [compiled-transformation] + (let [specialize (fn [compiled-transformation] + (reduce (fn [transformations-variation value] + (conj transformations-variation + (-> compiled-transformation + (assoc :value value) + (dissoc :vals)))) + [] + (:vals compiled-transformation)))] + (cart (map specialize compiled-transformation)))) + +(defn compile-transforms [transformations] + (map (fn [transformation] + (assoc transformation :fn (case (:lang transformation) + :js (js/script->transform-fn-vals (:script transformation)) + :sci (sci/sci-compile (:script transformation)) + (throw (Exception. (format "Language code '%s' is not supported" + (:lang transformation))))))) + transformations)) + +(defn apply-all [query-map transform-variations] + (map (fn [variation] + (let [fns-to-apply (map (fn [with-fn] [(:lang with-fn) (:fn with-fn) (:value with-fn)]) variation)] + {:variation variation + :request (reduce (fn [acc [lang afn aval]] + (case lang + :sci (afn acc aval) + ; first param to js transformation should be json string + ; js transformation always returns a string + ; we need to decode the response string + :js (json/decode (afn (json/encode acc) aval)) + (throw (Exception. (format "Language code '%s' is not supported" lang))))) + query-map + fns-to-apply)})) transform-variations)) + +(defn generate-queries + "Creates a cartesian product of all the variations of the provided vals and scripts. + Applies these transformations on the query hashmap. + Each transformation is applied in order on the query. + Returns a list of maps with :variation and :request keys: + - :variation is for debugging purposes + - :request is a hashmap that should be a valid ES query. + Query is expected be a decoded hashmap. + Transformations are hashmaps as per example." 
+ [query transformations] + (map (fn [generated-query] + (update generated-query :variation (fn [variation] + (map #(select-keys % [:id :value]) variation)))) + (apply-all query (-> transformations + (compile-transforms) + (create-transformation-variations))))) + +(comment + (replay.transform.impact/generate-queries + {:query {:match_all {}}} + [{:lang :sci :id :a :script "(fn [query value] (assoc query :a value))" :vals [1 2 3]} + {:lang :sci :id :b :script "(fn [query value] (assoc query :b value))" :vals [10 20 30]} + {:lang :sci :id :c :script "(fn [query value] (assoc query :c value))" :vals [100 200 300]} + {:lang :js :id :d :script "(query, value) => { query['d'] = value; return query; }" :vals ["a" "b" "c"]}])) diff --git a/src/replay/transform/query.clj b/src/replay/transform/query.clj new file mode 100644 index 0000000..c7ff900 --- /dev/null +++ b/src/replay/transform/query.clj @@ -0,0 +1,24 @@ +(ns replay.transform.query + (:require [polyglot.js :as js] + [polyglot.sci :as sci])) + +(defn compile-transform [{:keys [lang script]}] + (case (keyword lang) + :sci (sci/script->transform-fn script) + :js (js/script->transform-fn script) + (throw (Exception. 
(format "No such language supported: '%s'" (some-> lang name))))))
+
+(defn transform-fn
+  "Compiles every transform and composes them into one function that applies them
+  to the query in the order given. With no transforms the query is returned as is."
+  [transforms]
+  (let [tf-fn (apply comp (map compile-transform (reverse transforms)))]
+    (fn [query] (tf-fn query))))
+
+(comment
+  (time
+    (let [data "{}"
+          tfs [{:lang :js
+                :script "(request) => request"}
+               {:lang :sci
+                :script "(fn [q] (assoc q :_explain true))"}]
+          tf (transform-fn tfs)]
+      (dotimes [i 1000]
+        (tf data)))))
diff --git a/src/replay/transform/uri.clj b/src/replay/transform/uri.clj
new file mode 100644
index 0000000..15327f0
--- /dev/null
+++ b/src/replay/transform/uri.clj
@@ -0,0 +1,22 @@
+(ns replay.transform.uri
+  (:require [clojure.string :as str]))
+
+(defn transform-uri
+  "Runs the transforms over the uri one after another: each :match is compiled
+  with re-pattern and every occurrence of it is replaced with :replacement."
+  [uri transforms]
+  (reduce (fn [uri {:keys [match replacement]}]
+            (str/replace uri (re-pattern match) replacement)) uri transforms))
+
+(comment
+  (replay.transform.uri/transform-uri
+    "/foo/bar/baz"
+    [{:match "bar"
+      :replacement "moo"}]))
+
+(defn construct-endpoint
+  "Returns (:uri replay-conf) when it is set; otherwise takes the attribute named by
+  :uri_attr from the doc and runs the :uri-transforms of replay-conf over it."
+  [doc replay-conf]
+  (or (:uri replay-conf)
+      (let [uri (get doc (keyword (:uri_attr replay-conf)))]
+        (transform-uri uri (:uri-transforms replay-conf)))))
+
+(defn transform
+  "Applies string transformations in order on the uri."
+  [{:keys [uri transforms] :as request}]
+  (assoc request :transformed-uri (transform-uri uri transforms)))
diff --git a/src/server.clj b/src/server.clj
new file mode 100644
index 0000000..2b4260c
--- /dev/null
+++ b/src/server.clj
@@ -0,0 +1,20 @@
+(ns server
+  (:require [clojure.tools.logging :as log]
+            [org.httpkit.server :refer [run-server]]
+            [server.core :refer [app]]))
+
+(def default-http-server-config
+  {:ip "0.0.0.0"
+   :port 8090
+   :max-body Integer/MAX_VALUE})
+
+(defn start
+  "Start the HTTP server with provided settings."
+ [config] + (let [http-server-config (merge default-http-server-config config)] + (log/infof "Starting an HTTP server with config: '%s'" http-server-config) + (run-server #'app http-server-config))) + +(defn -main [] + (log/infof "Starting the server") + (start {})) diff --git a/src/server/core.clj b/src/server/core.clj new file mode 100644 index 0000000..d4005ab --- /dev/null +++ b/src/server/core.clj @@ -0,0 +1,37 @@ +(ns server.core + (:require [reitit.ring :as ring] + [reitit.coercion.spec] + [reitit.dev.pretty :as pretty] + [reitit.swagger :as swagger] + [muuntaja.core :as m] + [reitit.ring.coercion :as coercion] + [server.ops-routes :as ops-routes])) + +(def app + (ring/ring-handler + (ring/router + [ops-routes/ops] + {;;:reitit.middleware/transform dev/print-request-diffs ;; pretty diffs + ;;:validate spec/validate ;; enable spec validation for route data + ;;:reitit.spec/wrap spell/closed ;; strict top-level validation + :exception pretty/exception + :data {:coercion reitit.coercion.spec/coercion + :muuntaja m/instance + :middleware [;; swagger feature + swagger/swagger-feature + ;; query-params & form-params + ;parameters/parameters-middleware + ;;; content-negotiation + ;muuntaja/format-negotiate-middleware + ;;; encoding response body + ;muuntaja/format-response-middleware + ;;; exception handling + ;exception/exception-middleware + ;;; decoding request body + ;muuntaja/format-request-middleware + ;;; coercing response bodys + ;coercion/coerce-response-middleware + ;; coercing request parameters + coercion/coerce-request-middleware]}}) + (ring/routes + (ring/create-default-handler)))) diff --git a/src/server/ops_routes.clj b/src/server/ops_routes.clj new file mode 100644 index 0000000..0db8b5c --- /dev/null +++ b/src/server/ops_routes.clj @@ -0,0 +1,50 @@ +(ns server.ops-routes + (:require [core.json :as json] + [ops :as my-ops])) + +(defn operation->routes [{:keys [name docs defaults handler-fn]}] + (let [path-root-handler {:summary (format "fetches a 
docstring of the %s operation" name) + :handler (fn [_] + ; return available subopts + {:status 200 + :body (json/encode {:docs docs} {:pretty true})})}] + [(format "/%s" name) + ["" path-root-handler] + ["/" path-root-handler] + ["/execute" + {:post {:summary (format "execute %s operation" name) + :responses {200 {:body {:total int?}}} + :handler (fn [req] + {:status 200 + :body (let [decoded-body (json/decode (:body req))] + (handler-fn decoded-body))})}}] + ["/docs" + {:get {:summary (format "fetches a docstring of the %s operation" name) + :handler (fn [_] + {:status 200 + :headers {"Content-Type" "application/json"} + :body (json/encode {:docs docs} {:pretty true})})}}] + ["/source" + {:get {:summary "fetches source of the operation" + :handler (fn [_] + {:status 200 + :headers {"Content-Type" "application/json"} + :body (json/encode {:source nil} {:pretty true})})}}] + ["/defaults" + {:get {:summary (format "fetches default options of the %s operation" name) + :handler (fn [_] + {:status 200 + :headers {"Content-Type" "application/json"} + :body (json/encode {:defaults defaults} {:pretty true})})}}]])) + +(def ops + (vec (concat ["/ops"] + (map (fn [path] + [path {:summary "List of available operation" + :handler (fn [_] + {:status 200 + :headers {"Content-Type" "application/json"} + :body (json/encode {:ops (map (fn [op] (:name op)) + my-ops/operations)})})}]) + ["" "/"]) + (map (fn [operation] (operation->routes operation)) my-ops/operations)))) diff --git a/src/sink.clj b/src/sink.clj new file mode 100644 index 0000000..db83ee9 --- /dev/null +++ b/src/sink.clj @@ -0,0 +1,15 @@ +(ns sink + (:require [clojure.tools.logging :as log] + [sink.elasticsearch :as elasticsearch] + [sink.file :as file] + [sink.kafka :as kafka])) + +(defrecord KafkaRecord [key value headers]) + +(defn store! [records opts] + (let [sink-implementation-id (keyword (get-in opts [:sink :implementation]))] + (case sink-implementation-id + :kafka (kafka/store! 
records opts) + :elasticsearch (elasticsearch/store! records opts) + :file (file/store! records opts) + (log/errorf "No such sink '%s' implementation!" sink-implementation-id)))) diff --git a/src/sink/elasticsearch.clj b/src/sink/elasticsearch.clj new file mode 100644 index 0000000..5f42f4d --- /dev/null +++ b/src/sink/elasticsearch.clj @@ -0,0 +1,7 @@ +(ns sink.elasticsearch + (:require [clojure.tools.logging :as log] + [sink.elasticsearch.index :as elasticsearch])) + +(defn store! [records opts] + (log/infof "Sinking in Elasticsearch") + (elasticsearch/store! records (:sink opts))) diff --git a/src/sink/elasticsearch/index.clj b/src/sink/elasticsearch/index.clj new file mode 100644 index 0000000..f615549 --- /dev/null +++ b/src/sink/elasticsearch/index.clj @@ -0,0 +1,154 @@ +(ns sink.elasticsearch.index + (:require [clojure.core.reducers :as r] + [clojure.tools.logging :as log] + [core.ilm :as ilm] + [core.json :as json]) + (:import (io.confluent.connect.elasticsearch.jest JestElasticsearchClient) + (io.confluent.connect.elasticsearch BulkIndexingClient IndexableRecord Key) + (io.confluent.connect.elasticsearch.bulk BulkProcessor BulkProcessor$BehaviorOnMalformedDoc) + (org.apache.kafka.connect.sink SinkRecord) + (org.apache.kafka.common.utils SystemTime) + (java.util Map))) + +(def default-opts + "Documentation of these params can be found at: + https://docs.confluent.io/current/connect/kafka-connect-elasticsearch/configuration_options.html" + {:auto.create.indices.at.start true + :batch.size (int 10000) + :behavior.on.malformed.documents "warn" + :behavior.on.null.values "delete" + :compact.map.entries true + :connection.compression false + :connection.password nil + :connection.timeout.ms (int 60000) + :connection.url "http://localhost:9200" + :connection.username nil + :drop.invalid.message false + :elastic.https.ssl.cipher.suites nil + :elastic.https.ssl.enabled.protocols "TLSv1.2" + :elastic.https.ssl.endpoint.identification.algorithm "https" + 
(defn get-conf-val
  "Returns the first non-nil value found for `key`, looking in this order:
  `conf` by string name, `conf` by the key itself, `defaults` by string name,
  `defaults` by the key itself. `false` counts as a value; nil when nothing
  is found."
  [key conf defaults]
  (let [str-key (name key)
        candidates [(get conf str-key)
                    (get conf key)
                    (get defaults str-key)
                    (get defaults key)]]
    (first (filter some? candidates))))

(defn get-int-conf-val
  "Same lookup as `get-conf-val`, with the result stringified and parsed
  as an int (so both 15000 and \"15000\" are accepted)."
  [key conf defaults]
  (-> (get-conf-val key conf defaults)
      str
      Integer/parseInt))

(defn ^Map stringify-conf
  "Builds a map whose keys are the `name` of every key in `conf` and whose
  values are the `str` of every value; nil values stay nil."
  [conf]
  (into {}
        (map (fn [[k v]] [(name k) (some-> v str)]))
        conf))
(defn store!
  "Indexes `records` into an Elasticsearch index through the bulk processor
  built by `writer`. Each record is read with `(get record :_id)` and
  `(get record :_source)`, so plain maps and records such as `EsRecord` work.

  Params (keys of `sink-opts`, keyword or string):
  :connection.url   - ES host; falls back to :dest-host, then to the default
                      http://localhost:9200
  :dest-host        - same as :connection.url, lower precedence
  :dest.index       - index name, no default, required
  :dest-index       - same as :dest.index, lower precedence
  :already.encoded  - :_source is already a JSON string, default false
  :flush.timeout.ms - flush timeout, number or numeric string

  The index is created when missing. Its refresh interval is disabled while
  indexing and set back to \"1s\" afterwards, also when indexing fails.
  Throws IllegalArgumentException when no index name is provided."
  [records sink-opts]
  (let [; an explicit :connection.url wins, then :dest-host, and only then the
        ; built-in default (previously the default always shadowed :dest-host)
        dest-host (first (remove nil? [(get-conf-val :connection.url sink-opts {})
                                       (get-conf-val :dest-host sink-opts {})
                                       (get-conf-val :connection.url {} default-opts)]))
        ^String index-name (or (get-conf-val :dest.index sink-opts {})
                               (get-conf-val :dest-index sink-opts {}))
        _ (when-not index-name
            (throw (IllegalArgumentException.
                     "Destination index is required: provide :dest.index or :dest-index")))
        ; the bulk client must talk to the same host as the index management calls
        ^BulkProcessor bulk-processor (writer (assoc sink-opts :connection.url dest-host))
        ^String type-name (get-conf-val :type.name sink-opts default-opts)
        ; parsed like every other numeric option, so "60000" is accepted too
        ^Integer flush-timeout (get-int-conf-val :flush.timeout.ms sink-opts default-opts)
        already-encoded? (get-conf-val :already.encoded sink-opts {})]
    (when-not (ilm/index-exists? dest-host index-name)
      (log/infof "Created index: %s" (ilm/create-index! dest-host index-name)))
    (log/infof "Disabled index refresh interval: %s" (ilm/set-refresh-interval! dest-host index-name "-1"))
    (.start bulk-processor)
    (try
      (r/fold
        ; combinef: nothing to combine, records are added for their side effect
        (fn ([] nil) ([_ _] nil))
        ; reducef: hand every non-nil record to the bulk processor
        (fn [_ record]
          (when record
            (.add bulk-processor
                  (IndexableRecord.
                    (Key. index-name type-name (str (get record :_id)))
                    (if already-encoded?
                      (get record :_source)
                      (when-let [src (get record :_source)]
                        (json/encode-vanilla src)))
                    (System/nanoTime))
                  (record->sink-record index-name record)
                  flush-timeout)))
        records)
      (.flush bulk-processor flush-timeout)
      (finally
        (try
          (.stop bulk-processor)
          (finally
            (log/infof "Enabled index refresh interval: %s"
                       (ilm/set-refresh-interval! dest-host index-name "1s"))))))))
(defn filename->path-and-filename
  "Splits `filename` into {:path parent-directory :file-name last-segment}.
  :path is nil when the filename has no parent directory (e.g. \"test.ndjson\");
  previously that case threw a NullPointerException."
  [^String filename]
  (let [f (File. filename)]
    {:path (.getParent f)
     :file-name (.getName f)}))

(defn partition-and-write-to-file
  "Writes `records` in chunks of at most `partition-size` records, each chunk
  through `write-to-file`.
  When `filename` is given, chunk i goes to `<start>_<end>_<file-name>` in the
  parent directory of `filename` (or relative to the working directory when
  there is none), where start/end are the record indices covered by the chunk.
  When `filename` is nil every chunk is written with a nil filename."
  [partition-size filename records]
  (let [{:keys [path file-name]} (when filename (filename->path-and-filename filename))]
    (loop [parts (partition-all partition-size records)
           i 0]
      (let [part (first parts)
            start-index (* partition-size i)
            ; a chunk never holds more than partition-size records, so the
            ; last index is simply start + count - 1
            end-index (+ start-index (dec (count part)))
            partition-filename (when filename
                                 (if path
                                   (format "%s/%s_%s_%s" path start-index end-index file-name)
                                   (format "%s_%s_%s" start-index end-index file-name)))]
        (when (seq part)
          (write-to-file partition-filename part))
        (when (seq (rest parts))
          (recur (rest parts) (inc i)))))))
(defn map->headers
  "Turns a map of header name -> value into a lazy seq of Kafka RecordHeader.
  Names go through `name`, values through `str` and are encoded as UTF-8.
  Entries whose value is nil or an empty string/collection are dropped.
  Non-seqable values such as numbers, booleans or keywords are kept; they
  used to make `empty?` throw."
  [r]
  (for [[k v] r
        :when (not (or (nil? v)
                       (and (seqable? v) (empty? v))))]
    (RecordHeader. (name k) (.getBytes (str v) "UTF-8"))))
:topic (:sink opts)) + (let [sink-opts (:sink opts) + ^KafkaProducer producer (kafka-producer sink-opts) + topic (:topic sink-opts)] + (doseq [r records] + (try + (.send producer + (ProducerRecord. topic nil nil + (:key r) + (if (false? (:encode-value? sink-opts)) + (:value r) + (json/encode (:value r))) + (map->headers (:headers r))) + (reify Callback + (onCompletion [this metadata exception] + (when exception + (println metadata exception))))) + (catch Exception e + (log/errorf "Failed to store record '%s' in kafka because '%s'" r e)))) + (log/infof "Flushing records") + (.flush producer) + (log/infof "Flushed") + (.close producer (Duration/ofSeconds 2)))) + +(comment + ; without the key + (store! + [{:value {:test "test-1"}}] + {:sink {:topic "sink-test" + :bootstrap.servers "127.0.0.1:9092" + :linger.ms 0}}) + ; with the key + (store! + [{:key "test" :value {:test "test"}}] + {:sink {:topic "sink-test" + :bootstrap.servers "127.0.0.1:9092"}}) + ; with headers + (store! + [{:key "test" + :value {:test "test"} + :headers {:test-header "test-header"}}] + {:sink {:topic "sink-test" + :bootstrap.servers "127.0.0.1:9092"}})) diff --git a/src/source.clj b/src/source.clj new file mode 100644 index 0000000..299c3b2 --- /dev/null +++ b/src/source.clj @@ -0,0 +1,13 @@ +(ns source + (:require [clojure.tools.logging :as log] + [source.elasticsearch :as elasticsearch] + [source.kafka :as kafka] + [source.krp :as krp])) + +(defn fetch! 
(defn fetch
  "Fetches records from Elasticsearch using the strategy named by
  `[:source :strategy]`:
  :search-after-with-pit - delegates to the search-after-with-PIT source
  anything else (or nil) - delegates to the default records source.
  The strategy may be a keyword or a string (e.g. when the configuration was
  read from JSON or the command line); it is keywordized before dispatch,
  the same way `:implementation` is handled by the source/sink dispatchers."
  [opts]
  (let [strategy (some-> opts :source :strategy keyword)]
    (case strategy
      :search-after-with-pit (pit/fetch opts)
      (records/fetch opts))))
(defn fetch
  "Returns the hits produced by `scroll/hits` for the scroll configuration
  derived from `conf` (with `default-conf` as fallback, empty by default).
  When `conf` has `:max_docs`, at most that many hits are returned."
  ([conf] (fetch conf {}))
  ([conf default-conf]
   (let [limit (:max_docs conf)
         hits (scroll/hits (conf->scroll conf default-conf))]
     (cond->> hits
       limit (take limit)))))
(defn fetch
  "Fetches records with the search-after + point-in-time strategy.
  Reads from `opts`:
  :max_docs                - optional limit on the number of records
  [:source :remote :host]  - ES host, default http://localhost:9200
  [:source :index]         - index name or pattern, default \"*\";
                             [:source :remote :index] is still accepted as a
                             lower-precedence fallback
  [:source :query]         - query body, default match_all
  The index used to be read only from [:source :remote :index], so the
  documented layout `{:source {:index ...}}` silently searched all indices."
  [opts]
  (let [max-docs (-> opts :max_docs)
        es-host (or (-> opts :source :remote :host) "http://localhost:9200")
        index-name (or (-> opts :source :index)
                       (-> opts :source :remote :index)
                       "*")
        query (or (-> opts :source :query) {:query {:match_all {}}})
        ; NOTE(review): the PIT keep-alive is taken from :connect_timeout,
        ; confirm that reusing this option is intended
        keep-alive (or (-> opts :source :remote :connect_timeout) "30s")]
    (records es-host index-name query max-docs keep-alive)))
(defn val-for
  "Looks up option `key` in `source-opts`, first by the key itself and then by
  its string name, falling back to the default declared for it.
  Returns the first non-nil candidate (so an explicit `false` is honoured),
  or nil when there is none."
  [source-opts key]
  (let [candidates [(get source-opts key)
                    (get source-opts (name key))
                    (default-for key)]]
    (first (filter some? candidates))))
(defn assign-consumer
  "Assigns `consumer` to every partition of every topic whose name fully
  matches `topic-name-pattern`, using the topic listing reported by the
  consumer itself."
  [^KafkaConsumer consumer ^Pattern topic-name-pattern]
  (let [topic-partitions (for [[^String topic-name partition-infos] (.listTopics consumer)
                               :when (re-matches topic-name-pattern topic-name)
                               ^PartitionInfo partition-info partition-infos]
                           (TopicPartition. (.topic partition-info)
                                            (.partition partition-info)))]
    (log/infof "Assigning Kafka consumer to '%s' topic(s) partitions: %s"
               topic-name-pattern (count topic-partitions))
    (.assign consumer topic-partitions)))
(defn impatient-lazy-records
  "Lazy sequence of the records polled from `consumer` that ends, closing the
  consumer, once an empty poll happens with no retries left.

  consumer        - KafkaConsumer to poll
  retries         - empty polls still tolerated before giving up
  poll-timeout-ms - timeout of every poll in milliseconds
  max-retries     - value `retries` is reset to after a non-empty poll;
                    defaults to the initial `retries`. The reset value used to
                    be a hard-coded 3, which ignored the configured count.

  A `retries` value that is not a positive integer is treated as exhausted.
  The first poll happens when the function is called; subsequent polls happen
  as the sequence is consumed."
  ([consumer retries poll-timeout-ms]
   (impatient-lazy-records consumer retries poll-timeout-ms retries))
  ([^KafkaConsumer consumer retries poll-timeout-ms max-retries]
   (let [^ConsumerRecords recs (.poll consumer ^Duration (Duration/ofMillis poll-timeout-ms))
         empty-poll? (zero? (.count recs))]
     (lazy-cat recs
               (if (and empty-poll? (not (pos-int? retries)))
                 (do
                   (log/infof "Closing consumer: '%s'" consumer)
                   ; .close returns nil, which terminates the sequence
                   (.close consumer Duration/ZERO))
                 (impatient-lazy-records consumer
                                         (if empty-poll? (dec retries) max-retries)
                                         poll-timeout-ms
                                         max-retries))))))
+ Also, some additional keys: + :topic - specifies which topic to consume (used as a regex pattern) (if topic does + not exist, then IllegalStateException is thrown) + :group.id - when provided Kafka consumer will join consumer group, otherwise partitions + are assigned without joining the consumer group (much faster and less problems). + :timestamp - oldest Kafka records to consume, ISO string, e.g '2007-12-03T10:15:30.00Z' + :timestamp-to - the latest records to consume, ISO string, e.g '2007-12-03T10:15:30.00Z' + :offset - kafka offset for all the partitions, default 0. + :seek.poll.timeout.ms - seek poll timeout in ms, for initialization, default 0 (when :group.id + is provided you andyou want to reset offsets most likely you want to provide this + param with value of at least 2000). + :poll.timeout.ms - poll timeout in ms, default 2000. + :impatient? - specifies if the Kafka Consumer should terminate after :retry-count + :retry-count - how many retries impatient lazy consumer should try to fetch records + :decode-value? - whether to decode value to a map + :keywordize? - whether keys should be keywords, default true." + [{source-opts :source :as opts}] + (properties/opts-valid? :topic source-opts) + (let [max-docs (-> opts :max_docs) + ^KafkaConsumer consumer (init-consumer source-opts) + poll-timeout-ms (val-for source-opts :poll.timeout.ms) + timestamp-to (when-let [t (val-for source-opts :timestamp-to)] + (.toEpochMilli (Instant/parse t))) + offset-to (val-for source-opts :offset-to) + records (if (:impatient? source-opts) + (impatient-lazy-records consumer (val-for source-opts :retry-count) poll-timeout-ms) + (do + (.addShutdownHook (Runtime/getRuntime) + (Thread. ^Runnable (fn [] + (log/infof "Starting exit.") + (.close consumer)))) + (lazy-records consumer poll-timeout-ms))) + keywordize? (val-for source-opts :keywordize?) + decode-value? 
(val-for source-opts :decode-value?)] + (->> (if max-docs (take max-docs records) records) + (filter (fn [^ConsumerRecord record] (if timestamp-to + (< (.timestamp record) timestamp-to) + true))) + (filter (fn [^ConsumerRecord record] (if offset-to + (< (.offset record) offset-to) + true))) + (map (fn [^ConsumerRecord record] + (->Record (.topic record) + (reduce (fn [acc ^Header h] + (assoc acc (if keywordize? + (keyword (.key h)) + (.key h)) (String. (.value h)))) + {} (.headers record)) + (.key record) + (.timestamp record) + (.offset record) + (if decode-value? + (json/decode (.value record) keywordize?) + (.value record)) + (.partition record))))))) + +(comment + (fetch + {:max_docs 1 + :source {:topic "test-topic" + :bootstrap.servers "127.0.0.1:9092"}}) + + (fetch + {:max_docs 1 + :source {:topic "test-topic" + :bootstrap.servers "127.0.0.1:9092" + :impatient? true + :retry-count 3}}) + + (fetch + {:max_docs 1 + :source {:topic "test-topic" + :bootstrap.servers "127.0.0.1:9092" + :impatient? true + :retry-count 2 + :offset 0 + :seek.poll.timeout.ms 2000 + :group.id "group.id"}}) + + (fetch + {:max_docs 10 + :source {:topic "test-topic" + :bootstrap.servers "127.0.0.1:9092" + :impatient? true + :retry-count 3 + :decode-value? 
(def defaults
  "Fallback values for every Kafka REST Proxy source option."
  {:connection.url "http://localhost:8082"
   :topic "keywords-test-topic"
   :group.id "krp-group-id"
   :consumer.name "krp-instance"
   :format "binary"
   :timeout 2000
   :max.bytes 30000
   :offset 0
   :partitions nil
   :auto.offset.reset "earliest"
   :consumer.request.timeout.ms 5000
   :auto.commit.enable "true"
   :delete.consumer.instance true})

(defn opts->options
  "Normalizes the dotted source options into the dash-cased map used by the
  request helpers, filling in `defaults` for every option that is missing.
  Only a missing (nil) option falls back to its default, so an explicit
  `false` such as `:auto.commit.enable false` is preserved instead of being
  replaced by the default."
  [opts]
  (let [opt (fn [k]
              (let [v (get opts k)]
                (if (nil? v) (get defaults k) v)))]
    {:connection-url (opt :connection.url)
     :group-id (opt :group.id)
     :consumer-name (opt :consumer.name)
     :delete-consumer (opt :delete.consumer.instance)
     :format (opt :format)
     :topic (opt :topic)
     :timeout (opt :timeout)
     :max-bytes (opt :max.bytes)
     :offset (opt :offset)
     :partitions (opt :partitions)
     :auto-commit-enable (opt :auto.commit.enable)
     :auto-offset-reset (opt :auto.offset.reset)
     :consumer-request-timeout-ms (opt :consumer.request.timeout.ms)}))
(defn base64->string
  "Decodes a Base64 encoded string and returns the decoded bytes as a UTF-8
  string; returns nil for a nil input.
  The charset is explicit so the result does not depend on the platform
  default encoding."
  [^String value]
  (when value
    (String. (.decode (Base64/getDecoder) value) "UTF-8")))
(defn fetch
  "Fetches records from a topic through the Kafka REST Proxy.
  `:source` holds the proxy options (normalized by `opts->options`) and
  `:max_docs` optionally limits the number of returned records.
  A consumer instance is created and subscribed to the topic before anything
  is returned. The result is a lazy sequence; its last step deletes the
  consumer instance unless `:delete.consumer.instance` is explicitly false."
  [{opts :source max-docs :max_docs}]
  (let [connection-opts (opts->options opts)
        consumer (create-consumer-instance connection-opts)
        subscription (subscribe-to-topic connection-opts)]
    (try
      (log/infof "Created consumer instance '%s' and subscribed to the topic '%s'"
                 consumer subscription)
      ; only when the caller passed :offset explicitly: do one tiny fetch
      ; (max-bytes 10) and then reposition the consumer
      ; NOTE(review): presumably the fetch is needed so the consumer gets its
      ; partitions assigned before the positions call - confirm
      (when (-> opts :offset)
        (log/infof "Initialized lazy consumer for offset reset: %s"
                   (count (fetch-messages (assoc connection-opts :max-bytes 10))))
        (log/infof "Resetting the offset of the consumer group: %s"
                   (set-offset connection-opts)))
      ; NOTE(review): both parts of lazy-cat are evaluated lazily, i.e. after
      ; this try has been left, so exceptions thrown while the sequence is
      ; consumed are presumably not handled by the catch below - confirm
      (lazy-cat
        (let [records (lazy-records connection-opts)]
          (if max-docs
            (take max-docs records)
            records))
        ; cleanup step: runs once the records part is exhausted and
        ; contributes nothing to the sequence (evaluates to nil)
        (do
          (if (false? (-> opts :delete.consumer.instance))
            (log/infof "FINISHED CONSUMPTION AND NOT DELETING CONSUMER INSTANCE")
            (log/infof "Deleted the consumer instance '%s'"
                       (delete-consumer-instance connection-opts)))
          nil))
      (catch Exception e
        ; on failure the consumer instance is deleted regardless of
        ; :delete.consumer.instance
        (log/errorf "Failed to fetch data with error: %s" e)
        (log/infof "Deleted the consumer instance '%s'"
                   (delete-consumer-instance connection-opts))))))
@@ +(ns cli.subcommand-test + (:require [clojure.test :refer :all] + [cli :as cli])) + +(def test-ops + [{:name "server" + :handler-fn identity + :docs "Sample docs" + :defaults {:ip "0.0.0.0" + :port 8090 + :max-body Integer/MAX_VALUE}}]) + +(deftest subcommand-strategy-parsing + (testing "if subcommand is detected" + (let [args ["server" "--port" "8090"] + {:keys [errors operation]} (cli/recursive-parse args test-ops)] + (is (nil? errors)) + (is (= :server (get-in operation [:name]))) + (is (= {:port 8090} + (get-in operation [:conf :options])))))) + +(deftest pre-subcommand-parsing + (testing "if subcommand is detected" + (let [args ["-o" "foo" "-f" "the-file"] + {:keys [errors operation options]} (cli/recursive-parse args test-ops)] + (is (nil? operation)) + (is (nil? errors)) + (is (= (set [:config-file :operation]) + (set (keys options))))))) + +(def subcommand-with-ops + [{:name "foo" + :handler-fn identity + :docs "Sample docs" + :defaults {:max-docs 1 + :source {:topic "my-topic" + :partitions [1 2 3]}}}]) + +(deftest subcommand-parsing-with-source + (testing "if subcommand is detected" + (let [args ["foo" "--max-docs=42" "source" "--topic=test-topic" "--partitions=1,2,3,4"] + {:keys [errors operation options]} + (cli/recursive-parse args subcommand-with-ops)] + (is (nil? errors)) + (is (empty? (set (keys options)))) + (is (= :foo (:name operation))) + (is (= {:max-docs 42 :source {:topic "test-topic" + :partitions ["1" "2" "3" "4"]}} + (:options (:conf operation)))) + (is (empty? (:errors (:conf operation)))))) + + (testing "if subcommand is detected with undefined variables" + (let [args ["foo" "--max-docs" "42" "source" "--foo" "bar"] + {:keys [errors operation]} (cli/recursive-parse args subcommand-with-ops)] + (is (nil? errors)) + (is (= :foo (:name operation))) + (is (not (empty? 
(:errors (:conf operation))))))) + + (testing "if subcommand is with help flag" + (let [args ["foo" "--max-docs" "42" "source" "--help"] + {:keys [errors operation] :as o} (cli/recursive-parse args subcommand-with-ops)] + (is (nil? errors)) + (is (= :foo (:name operation))) + (is (empty? (:errors (:conf operation))))))) diff --git a/test/core/async_test.clj b/test/core/async_test.clj new file mode 100644 index 0000000..ea5f1cd --- /dev/null +++ b/test/core/async_test.clj @@ -0,0 +1,16 @@ +(ns core.async-test + (:require [clojure.test :refer [deftest is]] + [clojure.core.async :as a] + [core.async :as async])) + +(deftest laziness + (let [n (rand-int 100) + input-seq (range n) + afn (fn [input-value c] + (Thread/sleep (rand-int 500)) + (a/>!! c (inc input-value)) + (a/close! c)) + output-seq (async/map-pipeline-async afn 1030 input-seq)] + (is (= n (count output-seq))) + (is (not= input-seq output-seq)) + (is (= (set (map inc input-seq)) (set output-seq))))) diff --git a/test/ops/es_to_ndjson_test.clj b/test/ops/es_to_ndjson_test.clj new file mode 100644 index 0000000..f7ddcf9 --- /dev/null +++ b/test/ops/es_to_ndjson_test.clj @@ -0,0 +1,57 @@ +(ns ops.es-to-ndjson-test + (:require [clojure.test :refer [deftest is use-fixtures]] + [clojure.java.io :as io] + [clojure.tools.logging :as log] + [core.ilm :as ilm] + [sink.elasticsearch.index :as index] + [ops.es-to-ndjson :as es-to-ndjson] + [org.httpkit.client :as http] + [core.json :as json] + [scroll.request :as r])) + +(defn test-es-host [] + (or (System/getenv "ES_HOST") "http://localhost:9200")) + +(defn wait-for-elasticsearch [f] + (r/execute-request + {:method :get + :url (format "%s/_cluster/health" (test-es-host))}) + (f)) + +(use-fixtures :once wait-for-elasticsearch) + +(deftest ^:integration es-to-ndjson-operation + (let [es-host (or (System/getenv "ES_HOST") "http://localhost:9200") + source-index-name "reindex-source-test-index" + target-file "target/file.ndjson" + number-of-docs 10 + records (map (fn [x] 
{:_id x + :_source {:value x}}) (range number-of-docs))] + (if (.exists (io/file target-file)) + (io/delete-file target-file) + (io/make-parents target-file)) + (log/infof "Deleted source index='%s' at '%s': %s" + source-index-name es-host (ilm/delete-index! es-host source-index-name)) + (log/infof "Created source index='%s' at '%s': %s" + source-index-name es-host (ilm/create-index! es-host source-index-name)) + ; Fill source index with some records + (index/store! records {:connection.url es-host :dest.index source-index-name}) + (ilm/refresh-index! es-host source-index-name) + + (es-to-ndjson/es->ndjson + {:max_docs nil + :source {:implementation :elasticsearch + :remote {:host es-host} + :index source-index-name} + :sink {:implementation :file + :filename target-file}}) + (is (= 20 (count (line-seq (io/reader target-file))))) + + (is (false? (:errors + (json/decode + (:body + @(http/request + {:method :post + :url (str es-host "/_bulk") + :headers {"Content-Type" "application/x-ndjson"} + :body (slurp target-file)})))))))) diff --git a/test/ops/kafka_to_elasticsearch_test.clj b/test/ops/kafka_to_elasticsearch_test.clj new file mode 100644 index 0000000..bdbb086 --- /dev/null +++ b/test/ops/kafka_to_elasticsearch_test.clj @@ -0,0 +1,108 @@ +(ns ops.kafka-to-elasticsearch-test + (:require [clojure.test :refer [deftest is use-fixtures]] + [clojure.tools.logging :as log] + [core.ilm :as ilm] + [ops.kafka-to-es :as kafka-to-es] + [sink.kafka :as kafka] + [source.elasticsearch :as es] + [source.kafka :as source-kafka] + [scroll.request :as r] + [test-helpers :as th])) + +(defn test-es-host [] + (or (System/getenv "ES_HOST") "http://localhost:9200")) + +(def source-opts + {:impatient? 
true + :retry-count 1}) + +(defn wait-for-elasticsearch [f] + (r/execute-request + {:method :get + :url (format "%s/_cluster/health" (test-es-host))}) + (f)) + +(use-fixtures :once wait-for-elasticsearch) + +(deftest ^:integration ^:kafka kafka-to-es + (let [source-topic "source-topic" + boostrap-servers (or (System/getenv "KAFKA_BOOTSTRAP_SERVERS") "127.0.0.1:9092") + es-host (test-es-host) + records [{:key "key" + :value {:test "test"} + :headers {:meta "meta"}}] + dest-index-name "kafka-to-es-test-index"] + ; SCENARIO: + ; delete es index + ; delete source topic + ; create source topic + ; put a doc in kafka topic + ; check that the doc is in kafka topic + ; execute op + ; check that the elasticsearch has the doc from kafka + (log/infof "Deleted dest index='%s' at '%s': %s" + dest-index-name es-host (ilm/delete-index! es-host dest-index-name)) + (th/recreate-topics! [source-topic] {"bootstrap.servers" boostrap-servers}) + (is (nil? (kafka/store! records {:sink {:topic source-topic + :bootstrap.servers boostrap-servers}}))) + (is (seq (source-kafka/fetch {:max_docs 1 + :source (merge {:topic source-topic + :bootstrap.servers boostrap-servers} + source-opts)}))) + (is (nil? (kafka-to-es/kafka-to-es {:max_docs 1 + :source (merge {:topic source-topic + :bootstrap.servers boostrap-servers} + source-opts) + :dest {:index dest-index-name + :remote {:host es-host}} + :sink {}}))) + (ilm/refresh-index! 
es-host dest-index-name) + (let [docs (es/fetch {:max_docs 10 + :source {:remote {:host es-host} + :index dest-index-name}})] + (is (seq docs)) + (is (= (set (map :key records)) + (set (map :_id docs)))) + (is (= (set (map :value records)) + (set (map :_source docs))))))) + +(deftest ^:integration ^:kafka kafka-to-es-no-key + (let [source-topic "source-topic" + boostrap-servers (or (System/getenv "KAFKA_BOOTSTRAP_SERVERS") "127.0.0.1:9092") + es-host (or (System/getenv "ES_HOST") "http://localhost:9200") + records [{:key nil + :value {:test "test"} + :headers {:meta "meta"}}] + dest-index-name "kafka-to-es-test-index"] + ; SCENARIO: + ; delete es index + ; delete source topic + ; create source topic + ; put a doc in kafka topic + ; check that the doc is in kafka topic + ; execute op + ; check that the elasticsearch has the doc from kafka + (log/infof "Deleted dest index='%s' at '%s': %s" + dest-index-name es-host (ilm/delete-index! es-host dest-index-name)) + (th/recreate-topics! [source-topic] {"bootstrap.servers" boostrap-servers}) + (is (nil? (kafka/store! records {:sink {:topic source-topic + :bootstrap.servers boostrap-servers}}))) + (is (seq (source-kafka/fetch {:max_docs 1 + :source (merge {:topic source-topic + :bootstrap.servers boostrap-servers} + source-opts)}))) + (is (nil? (kafka-to-es/kafka-to-es {:max_docs 1 + :source (merge {:topic source-topic + :bootstrap.servers boostrap-servers} + source-opts) + :dest {:index dest-index-name + :remote {:host es-host}} + :sink {}}))) + (ilm/refresh-index! 
es-host dest-index-name) + (let [docs (es/fetch {:max_docs 10 + :source {:remote {:host es-host} + :index dest-index-name}})] + (is (seq docs)) + (is (= (set ["source-topic+0+0"]) (set (map :_id docs)))) + (is (= (set (map :value records)) + (set (map :_source docs))))))) diff --git a/test/ops/kafka_to_kafka_test.clj b/test/ops/kafka_to_kafka_test.clj new file mode 100644 index 0000000..cd592fe --- /dev/null +++ b/test/ops/kafka_to_kafka_test.clj @@ -0,0 +1,64 @@ +(ns ops.kafka-to-kafka-test + (:require [clojure.test :refer [deftest is]] + [ops.kafka-to-kafka :as kafka-to-kafka] + [source.kafka :as source-kafka] + [sink.kafka :as kafka] + [test-helpers :as th])) + +(def source-opts + {:impatient? true + :retry-count 1}) + +(deftest ^:integration ^:kafka kafka-to-kafka-op + (let [source-topic "source-topic" + sink-topic "sink-topic" + boostrap-servers (or (System/getenv "KAFKA_BOOTSTRAP_SERVERS") + "127.0.0.1:9092") + records [{:key "key" + :value {:test "test"} + :headers {:meta "meta"}}]] + ; SCENARIO: + ; delete source topic + ; delete sink topic + ; check that sink topic doesnt't have docs + ; write some data to source topic + ; run op + ; check sink topic for the document + (th/recreate-topics! [source-topic sink-topic] {"bootstrap.servers" boostrap-servers}) + (is (empty? (source-kafka/fetch {:max_docs 1 + :source (merge {:topic source-topic + :bootstrap.servers boostrap-servers} + source-opts)}))) + (is (empty? (source-kafka/fetch {:max_docs 1 + :source (merge {:topic sink-topic + :bootstrap.servers boostrap-servers} + source-opts)}))) + (is (nil? (kafka/store! records {:sink {:topic source-topic + :bootstrap.servers boostrap-servers}}))) + (is (seq (source-kafka/fetch {:max_docs 1 + :source (merge {:topic source-topic + :bootstrap.servers boostrap-servers} + source-opts)}))) + (is (empty? (source-kafka/fetch {:max_docs 1 + :source (merge {:topic sink-topic + :bootstrap.servers boostrap-servers} + source-opts)}))) + (is (nil? 
(kafka-to-kafka/execute {:max_docs 1 + :source (merge {:topic source-topic + :bootstrap.servers boostrap-servers} + source-opts) + :sink {:topic sink-topic + :bootstrap.servers boostrap-servers}}))) + (let [[first-record :as actual-records] + (source-kafka/fetch {:max_docs 1 + :source (merge {:topic sink-topic + :bootstrap.servers boostrap-servers} + source-opts)})] + (is (seq actual-records)) + (is (= 1 (count actual-records))) + (is (= (:value (first records)) + (:value first-record))) + (is (= (:key (first records)) + (:key first-record))) + (is (= (:headers (first records)) + (:headers first-record)))))) diff --git a/test/ops/reindex_test.clj b/test/ops/reindex_test.clj new file mode 100644 index 0000000..1e81f7a --- /dev/null +++ b/test/ops/reindex_test.clj @@ -0,0 +1,51 @@ +(ns ops.reindex-test + (:require [clojure.test :refer [deftest is]] + [clojure.tools.logging :as log] + [scroll :as scroll] + [sink.elasticsearch.index :as index] + [core.ilm :as ilm] + [ops.es-to-es :as reindex])) + +(deftest ^:integration in-cluster-reindex + (let [es-host (or (System/getenv "ES_HOST") "http://localhost:9200") + source-index-name "reindex-source-test-index" + dest-index-name "reindex-dest-test-index" + number-of-docs (+ 10 (rand-int 50)) + records (map (fn [x] {:_id x + :_source {:value x}}) (range number-of-docs))] + ; Sanity check that two indices are different + (is (not= source-index-name dest-index-name)) + (is (< 0 number-of-docs)) + + (log/infof "Deleted source index='%s' at '%s': %s" + source-index-name es-host (ilm/delete-index! es-host source-index-name)) + (log/infof "Deleted dest index='%s' at '%s': %s" + dest-index-name es-host (ilm/delete-index! es-host dest-index-name)) + + (log/infof "Created source index='%s' at '%s': %s" + source-index-name es-host (ilm/create-index! es-host source-index-name)) + (log/infof "Created dest index='%s' at '%s': %s" + dest-index-name es-host (ilm/create-index! 
es-host dest-index-name)) + + ; Fill source index with some records + (index/store! records {:connection.url es-host :dest.index source-index-name}) + (ilm/refresh-index! es-host source-index-name) + + ; PRE check that souce index has all docs + (is (= (set (map :_source records)) + (set (map :_source (scroll/hits {:es-host es-host :index-name source-index-name}))))) + + ; Initialize reindex job + (reindex/reindex! + {:source {:remote {:host es-host} + :index source-index-name} + :dest {:index dest-index-name + :remote {:host es-host}}}) + (ilm/refresh-index! es-host dest-index-name) + + ; Check if what is reindexed is the same as source data + (is (= (set (map :_source records)) + (set (map :_source (scroll/hits {:es-host es-host :index-name dest-index-name}))))) + ; Check if source and dest indices have the same content + (is (= (set (map :_source (scroll/hits {:es-host es-host :index-name source-index-name}))) + (set (map :_source (scroll/hits {:es-host es-host :index-name dest-index-name}))))))) diff --git a/test/polyglot_test.clj b/test/polyglot_test.clj new file mode 100644 index 0000000..08c2347 --- /dev/null +++ b/test/polyglot_test.clj @@ -0,0 +1,50 @@ +(ns polyglot-test + (:require [clojure.test :refer :all] + [polyglot.js :as js] + [core.json :as json] + [polyglot :as polyglot])) + +(def empty-map {}) +(def nested-map {:foo {:bar {:baz "quux"}}}) + +(deftest polyglot-string-transformations + (testing "simple cases" + (let [input-string (json/encode empty-map)] + (is (= input-string (js/string->string input-string "(s) => s"))) + (is (= (json/encode {"foo" "bar"}) + (js/string->string input-string "(s) => {s['foo'] = 'bar'; return s}"))))) + + (testing "deeply nested map" + (let [input-string (json/encode nested-map)] + (is (= (json/encode (assoc-in nested-map [:foo :bar :quuz] "corge")) + (js/string->string input-string + "(s) => { + s['foo']['bar']['quuz'] = 'corge'; + return s; + }"))))) + + (testing "invalid script" + (let [input-string (json/encode 
empty-map)] + (is (thrown? Exception (= input-string (js/string->string input-string "(s) => function"))))))) + +(deftest polyglot-map-transformations + (testing "simple cases" + (let [m empty-map] + (is (= m (polyglot/map->map m "(s) => s"))) + (is (= {:foo "bar"} + (polyglot/map->map m "(s) => {s['foo'] = 'bar'; return s}"))))) + + (testing "deeply nested map" + (let [m nested-map] + (is (= (assoc-in nested-map [:foo :bar :quuz] "corge") + (polyglot/map->map m "(s) => {s['foo']['bar']['quuz'] = 'corge'; return s}"))))) + + (testing "script expects two arguments" + (let [m empty-map] + (is (= m (polyglot/map->map m "(s) => s"))) + (is (= {:foo "bar"} + (polyglot/map->map m "(s, s1) => {s['foo'] = 'bar'; return s}"))))) + + (testing "invalid script" + (let [m empty-map] + (is (thrown? Exception (= m (polyglot/map->map m "(s) => function"))))))) diff --git a/test/replay/core_test.clj b/test/replay/core_test.clj new file mode 100644 index 0000000..7143781 --- /dev/null +++ b/test/replay/core_test.clj @@ -0,0 +1,25 @@ +(ns replay.core-test + (:require [clojure.test :refer [deftest is testing]] + [clojure.set :as cset] + [core.json :as json] + [replay.core :as replay])) + +(deftest hits-count-extraction + (testing "es <7 resp" + (let [resp-body (json/read-file "test/resources/es6-resp.json")] + (is (= {:value 0 :relation "eq"} + (replay/hits-count resp-body))))) + (testing "es 7 resp" + (let [resp-body (json/read-file "test/resources/es7-resp.json")] + (is (= {:value 0 :relation "eq"} + (replay/hits-count resp-body)))))) + +(deftest resp-construction + (let [input-doc (json/read-file "test/resources/replay-input-doc.json") + resp-body (json/read-file "test/resources/es7-resp.json")] + (is (cset/subset? 
+ (set [:timestamp :id :query-id :endpoint :service-time :es-time :config]) + (set (keys (get-in (replay/post-process + input-doc "endpoint" (json/encode-vanilla {:query {:match_all {}}}) + 0 resp-body {:foo "foo"}) + [:_source :replay]))))))) diff --git a/test/replay/replay_with_transform_test.clj b/test/replay/replay_with_transform_test.clj new file mode 100644 index 0000000..eae5203 --- /dev/null +++ b/test/replay/replay_with_transform_test.clj @@ -0,0 +1,117 @@ +(ns replay.replay-with-transform-test + (:require [clojure.test :refer :all] + [clojure.tools.logging :as log] + [core.ilm :as ilm] + [core.json :as json] + [sink.elasticsearch.index :as index] + [scroll.request :as r] + [replay.core :as replay])) + +(defn test-es-host [] + (or (System/getenv "ES_HOST") "http://localhost:9200")) + +(defn wait-for-elasticsearch [f] + (r/execute-request + {:method :get + :url (format "%s/_cluster/health" (test-es-host))}) + (f)) + +(use-fixtures :once wait-for-elasticsearch) + +(defn recreate-index [es-host index-name] + (when (ilm/index-exists? es-host index-name) + (log/infof "Deleted source index='%s' at '%s': %s" + index-name es-host (ilm/delete-index! es-host index-name))) + (log/infof "Created index: %s" (ilm/create-index! es-host index-name))) + +(def simple-replay + {:id "test-replay" + :description "Test replay" + :query_attr "source" + :uri_attr "uri" + :connection.url (test-es-host) + :concurrency 1 + :repeats 1}) + +(deftest ^:integration es-to-es-replay + (let [target-index "target-index" + query {:query {:match {:title "foo"}} + :_source false} + queries [{:_id "query_1" + :_source {:source (json/encode query) + :uri (str "/" target-index "/_search")}}] + docs [{:_id "doc_1" + :_source {:title "foo"}}] + queries-index "replay-queries" + sink-index "sink-index"] + (recreate-index (test-es-host) queries-index) + (recreate-index (test-es-host) target-index) + (recreate-index (test-es-host) sink-index) + (index/store! 
queries {:connection.url (test-es-host) :dest.index queries-index}) + (index/store! docs {:connection.url (test-es-host) :dest.index target-index}) + ; check if sink-index is empty + (is (= 0 (count (scroll/hits {:es-host (test-es-host) :index-name sink-index})))) + (replay/replay + {:max_docs 1 + :source {:remote {:host (test-es-host)} + :index queries-index} + :replay simple-replay + :sink {:connection.url (test-es-host) + :dest.index sink-index}}) + ; check if sink-index is not empty + (let [sink-docs (scroll/hits {:es-host (test-es-host) :index-name sink-index}) + replay-hits (-> sink-docs + first + :_source + :replay + :response + json/decode + :hits + :hits)] + (is (= 1 (count sink-docs))) + (is (= 1 (count replay-hits))) + (is (nil? (:_source (first replay-hits))))))) + +(def replay-with-transform + (assoc simple-replay + :query-transforms [{:lang :js + :script "(q) => Object.assign(q, {'_source': true})"}])) + +(deftest ^:integration es-to-es-replay-with-transform + (let [target-index "target-index" + query {:query {:match {:title "foo"}} + :_source false} + queries [{:_id "query_1" + :_source {:source (json/encode query) + :uri (str "/" target-index "/_search")}}] + docs [{:_id "doc_1" + :_source {:title "foo"}}] + queries-index "replay-queries" + sink-index "sink-index"] + (recreate-index (test-es-host) queries-index) + (recreate-index (test-es-host) target-index) + (recreate-index (test-es-host) sink-index) + (index/store! queries {:connection.url (test-es-host) :dest.index queries-index}) + (index/store! 
docs {:connection.url (test-es-host) :dest.index target-index}) + ; check if sink-index is empty + (is (= 0 (count (scroll/hits {:es-host (test-es-host) :index-name sink-index})))) + (replay/replay + {:max_docs 1 + :source {:remote {:host (test-es-host)} + :index queries-index} + :replay replay-with-transform + :sink {:connection.url (test-es-host) + :dest.index sink-index}}) + ; check if sink-index is not empty + (let [sink-docs (scroll/hits {:es-host (test-es-host) :index-name sink-index}) + replay-hits (-> sink-docs + first + :_source + :replay + :response + json/decode + :hits + :hits)] + (is (= 1 (count sink-docs))) + (is (= 1 (count replay-hits))) + (is (= {:title "foo"} (:_source (first replay-hits))))))) diff --git a/test/replay/transform/query_test.clj b/test/replay/transform/query_test.clj new file mode 100644 index 0000000..6a5f175 --- /dev/null +++ b/test/replay/transform/query_test.clj @@ -0,0 +1,30 @@ +(ns replay.transform.query-test + (:require [clojure.test :refer :all] + [core.json :as json] + [replay.transform.query :as transform.query])) + +(deftest transforming-query-json-string + (testing "mixing the supported languages for transforms" + (let [query "{}" + tfs [{:lang :js + :script "(q) => Object.assign(q, {'_source': true})"} + {:lang :sci + :script "(fn [q] (assoc q :_explain true))"}] + transform-fn (transform.query/transform-fn tfs)] + (is (= {:_source true :_explain true} + (json/decode (transform-fn query)))))) + + (testing "the order of applied transformations" + (let [query "{}" + tfs [{:lang :js + :script "(q) => Object.assign(q, {'_source': 1})"} + {:lang :js + :script "(q) => Object.assign(q, {'_source': 2})"}] + transform-fn (transform.query/transform-fn tfs)] + (is (= {:_source 2} + (json/decode (transform-fn query)))))) + + (testing "when bad language id is provided" + (let [tfs [{:lang :not-existing + :script "(q) => q"}]] + (is (thrown? 
Exception (transform.query/transform-fn tfs)))))) diff --git a/test/replay/transform/uri_test.clj b/test/replay/transform/uri_test.clj new file mode 100644 index 0000000..8fed730 --- /dev/null +++ b/test/replay/transform/uri_test.clj @@ -0,0 +1,31 @@ +(ns replay.transform.uri-test + (:require [clojure.test :refer :all] + [replay.transform.uri :as transform-uri])) + +(deftest uri-transforms + (testing "simple string replacement" + (let [uri "/my_search_index/_search?q=elasticsearch" + transforms [{:match "my_search_index" :replacement "test"}]] + (is (= "/test/_search?q=elasticsearch" + (transform-uri/transform-uri uri transforms))))) + + (testing "_count replacement with _search size=0" + (let [uri "/my_search_index/_count?q=elasticsearch" + transforms [{:match "_count\\?" :replacement "_search?size=0&"}]] + (is (= "/my_search_index/_search?size=0&q=elasticsearch" + (transform-uri/transform-uri uri transforms)))))) + +(deftest endpoint-construction + (testing "uri is provided id replay conf" + (let [doc {} + replay-conf {:uri "foo"}] + (is (= "foo" (transform-uri/construct-endpoint doc replay-conf))))) + (testing "uri.attr specified" + (let [doc {:foo "_search"} + replay-conf {:uri_attr "foo"}] + (is (= "_search" (transform-uri/construct-endpoint doc replay-conf))))) + (testing "uri.attr specified with transforms" + (let [doc {:foo "/foo/_count?q=elastic"} + replay-conf {:uri_attr "foo" + :uri-transforms [{:match "_count\\?" 
:replacement "_search?size=0&"}]}] + (is (= "/foo/_search?size=0&q=elastic" (transform-uri/construct-endpoint doc replay-conf)))))) diff --git a/test/resources/es6-resp.json b/test/resources/es6-resp.json new file mode 100644 index 0000000..b6cbd5d --- /dev/null +++ b/test/resources/es6-resp.json @@ -0,0 +1,15 @@ +{ + "took" : 2, + "timed_out" : false, + "_shards" : { + "total" : 24, + "successful" : 24, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : 0, + "max_score" : null, + "hits" : [ ] + } +} diff --git a/test/resources/es7-resp.json b/test/resources/es7-resp.json new file mode 100644 index 0000000..6d07530 --- /dev/null +++ b/test/resources/es7-resp.json @@ -0,0 +1,18 @@ +{ + "took" : 1, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 0, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + } +} diff --git a/test/resources/replay-input-doc.json b/test/resources/replay-input-doc.json new file mode 100644 index 0000000..52a0253 --- /dev/null +++ b/test/resources/replay-input-doc.json @@ -0,0 +1,23 @@ +{ + "_index": "query_logs_20200724", + "_type": "_doc", + "_id": "query_logs_20200724+7+1544979", + "_version": 1544979, + "_score": null, + "_source": { + "header.offset": 1544979, + "header.topic": "storage-topic", + "header.timestamp": 1595626556589, + "body": "{\"query\":{\"match_all\":{}}}", + "uri": "/index-name/_count?terminate_after=99", + "header.partition": 7 + }, + "fields": { + "header.timestamp": [ + "2020-07-24T21:35:56.589Z" + ] + }, + "sort": [ + 1595626556589 + ] +} \ No newline at end of file diff --git a/test/sink/kafka_test.clj b/test/sink/kafka_test.clj new file mode 100644 index 0000000..2db3f6c --- /dev/null +++ b/test/sink/kafka_test.clj @@ -0,0 +1,61 @@ +(ns sink.kafka-test + (:require [clojure.test :refer [deftest is]] + [core.json :as json] + [sink.kafka :as kafka] + [source.kafka :as source-kafka] + [test-helpers 
:as th])) + +(def source-opts + {:impatient? true + :retry-count 1}) + +; When working from REPL is source.kafka if modified then the test fails +; you need to run tests once again +(deftest ^:integration ^:kafka sending-data-to-kafka-with-key-and-headers + (let [test-topic "test-topic" + boostrap-servers (or (System/getenv "KAFKA_BOOTSTRAP_SERVERS") + "127.0.0.1:9092") + kafka-opts {:topic test-topic + :bootstrap.servers boostrap-servers} + sink-opts {:sink kafka-opts} + records [{:key "key" + :value {:test "test"} + :headers {:meta "meta"}}]] + (th/recreate-topics! [test-topic] {"bootstrap.servers" boostrap-servers}) + (is (empty? (source-kafka/fetch {:max_docs 1 + :source (merge kafka-opts source-opts)}))) + (is (nil? (kafka/store! records sink-opts))) + (let [[first-record :as actual-records] + (source-kafka/fetch {:max_docs 1 + :source (merge kafka-opts source-opts)})] + (is (seq actual-records)) + (is (= 1 (count actual-records))) + (is (= (:value (first records)) + (:value first-record))) + (is (= (:key (first records)) + (:key first-record))) + (is (= (:headers (first records)) + (:headers first-record)))))) + +(deftest ^:integration ^:kafka sending-data-to-kafka-without-encoding + (let [test-topic "test-topic-sink-raw" + boostrap-servers (or (System/getenv "KAFKA_BOOTSTRAP_SERVERS") + "127.0.0.1:9092") + kafka-opts {:topic test-topic + :bootstrap.servers boostrap-servers} + sink-opts {:sink kafka-opts} + value {:test "test"} + records [{:key "key" + :value (json/encode value) + :headers {:meta "meta"}}]] + (th/recreate-topics! [test-topic] {"bootstrap.servers" boostrap-servers}) + (is (empty? (source-kafka/fetch {:max_docs 1 + :source (merge kafka-opts source-opts)}))) + (is (nil? (kafka/store! records (assoc-in sink-opts + [:sink :encode-value?] false)))) + (let [[first-record :as actual-records] + (source-kafka/fetch {:max_docs 1 + :source (merge kafka-opts source-opts)})] + (is (seq actual-records)) + (is (= 1 (count actual-records))) + (is (map? 
(:value first-record)))))) diff --git a/test/source/kafka_test.clj b/test/source/kafka_test.clj new file mode 100644 index 0000000..45fcde6 --- /dev/null +++ b/test/source/kafka_test.clj @@ -0,0 +1,53 @@ +(ns source.kafka-test + (:require [clojure.test :refer [deftest is testing]] + [source.kafka :as source-kafka] + [sink.kafka :as kafka] + [test-helpers :as th])) + +(def source-opts + {:impatient? true + :retry-count 1}) + +(deftest ^:integration ^:kafka kafka-consumer-kwyeordize-flag + (let [test-topic "keywords-test-topic" + boostrap-servers (or (System/getenv "KAFKA_BOOTSTRAP_SERVERS") + "127.0.0.1:9092") + kafka-opts {:topic test-topic + :bootstrap.servers boostrap-servers} + sink-opts {:sink kafka-opts} + records [{:key "key" + :value {:test "test"} + :headers {:meta "meta"}}]] + (th/recreate-topics! [test-topic] {"bootstrap.servers" boostrap-servers}) + (is (empty? (source-kafka/fetch {:max_docs 1 + :source (merge kafka-opts source-opts)}))) + (is (nil? (kafka/store! records sink-opts))) + + (testing "by default keys should be keywords" + (let [[first-record :as actual-records] + (source-kafka/fetch {:max_docs 1 + :source (merge kafka-opts source-opts)})] + (is (seq actual-records)) + (is (= 1 (count actual-records))) + (is (keyword? (first (keys (:value first-record))))) + (is (keyword? (first (keys (:headers first-record))))))) + + (testing "keys should be strings" + (let [[first-record :as actual-records] + (source-kafka/fetch {:max_docs 1 + :source (assoc (merge kafka-opts source-opts) + :keywordize? false)})] + (is (seq actual-records)) + (is (= 1 (count actual-records))) + (is (string? (first (keys (:value first-record))))) + (is (string? (first (keys (:headers first-record))))))) + + (testing "not decode value strings" + (let [[first-record :as actual-records] + (source-kafka/fetch {:max_docs 1 + :source (assoc (merge kafka-opts source-opts) + :decode-value? false)})] + (is (seq actual-records)) + (is (= 1 (count actual-records))) + (is (string? 
(:value first-record))) + (is (keyword? (first (keys (:headers first-record))))))))) diff --git a/test/test_helpers.clj b/test/test_helpers.clj new file mode 100644 index 0000000..143e1d7 --- /dev/null +++ b/test/test_helpers.clj @@ -0,0 +1,46 @@ +(ns test-helpers + (:require [clojure.tools.logging :as log]) + (:import (java.util.concurrent ExecutionException) + (java.util Map) + (org.apache.kafka.clients.admin AdminClient NewTopic))) + +(def default-exponential-backoff-params + {:time 100 + :rate 2 + :max 5000 + :p? identity}) + +(defn exponential-backoff + ([f] (exponential-backoff f default-exponential-backoff-params)) + ([f {:keys [time rate max p?] :as opts}] + (if (>= time max) ;; we're over budget, just call f + (f) + (try + (f) + (catch Throwable t + (if (p? t) + (do + (Thread/sleep time) + (exponential-backoff f (assoc opts :time (* time rate)))) + (throw t))))))) + +(defn create-topics! [^AdminClient kafka-admin topics] + (try + (.get (.all (.createTopics kafka-admin (map (fn [^String topic] (NewTopic. topic 1 (short 1))) topics)))) + (catch ExecutionException e + (log/warnf "Creating topics '%s' got exception '%s'" topics e)))) + +(defn delete-topic! [^AdminClient kafka-admin topics] + (try + (.get (.all (.deleteTopics kafka-admin topics))) + (catch ExecutionException e + (log/warnf "Deleting topics '%s' got exception '%s'" topics e)))) + +(defn recreate-topics! [topics ^Map opts] + (let [^AdminClient kafka-admin (AdminClient/create opts)] + (delete-topic! kafka-admin topics) + (Thread/sleep 500) + (create-topics! kafka-admin topics) + (exponential-backoff (fn [] + (log/infof "Details of a topics: %s" topics) + (.get (.all (.describeTopics kafka-admin topics)))))))