From 9deaf5ace8c439b7542cdf42640abaaba36d1fe6 Mon Sep 17 00:00:00 2001 From: abimichel Date: Wed, 24 Jul 2024 16:38:42 -0700 Subject: [PATCH] organize files and gha workflow --- .github/workflows/docker-ora2pg.yaml | 9 +- .../{docker-trino.yaml => docker-ora2s3.yaml} | 7 +- .github/workflows/docker-pg2pg.yaml | 5 +- requirements.txt | 12 - .../extractors/tap-rest-api-msdk--widen.lock | 357 ------------------ .../loaders/target-postgres--meltanolabs.lock | 260 ------------- Dockerfile => shared/ora2pg/Dockerfile | 2 +- .../ora2pg/data_replication_ora2pg.py | 0 shared/ora2pg/requirements.txt | 6 + shared/ora2s3/Dockerfile | 21 ++ shared/ora2s3/data_replication_ora2s3.py | 68 ++++ shared/ora2s3/requirements.txt | 4 + shared/pg2pg/requirements.txt | 9 +- shared/tools/meltano/Dockerfile | 3 + shared/{ => tools}/meltano/ods-dev/.gitignore | 0 .../meltano/ods-dev/analyze/.gitkeep | 0 .../meltano/ods-dev/extract/.gitkeep | 0 .../{ => tools}/meltano/ods-dev/load/.gitkeep | 0 .../{ => tools}/meltano/ods-dev/meltano.yml | 0 .../meltano/ods-dev/notebook/.gitkeep | 0 .../meltano/ods-dev/orchestrate/.gitkeep | 0 .../meltano/ods-dev/output/.gitignore | 0 .../meltano/ods-dev/requirements.txt} | 0 .../tools/meltano/ods-dev/transform/.gitkeep | 0 shared/{ => tools}/trino/Dockerfile | 0 25 files changed, 114 insertions(+), 649 deletions(-) rename .github/workflows/{docker-trino.yaml => docker-ora2s3.yaml} (93%) delete mode 100644 requirements.txt delete mode 100644 shared/meltano/ods-dev/plugins/extractors/tap-rest-api-msdk--widen.lock delete mode 100644 shared/meltano/ods-dev/plugins/loaders/target-postgres--meltanolabs.lock rename Dockerfile => shared/ora2pg/Dockerfile (93%) rename data_replication_parametrized_audit_os.py => shared/ora2pg/data_replication_ora2pg.py (100%) create mode 100644 shared/ora2pg/requirements.txt create mode 100644 shared/ora2s3/Dockerfile create mode 100644 shared/ora2s3/data_replication_ora2s3.py create mode 100644 shared/ora2s3/requirements.txt create mode 100644 shared/tools/meltano/Dockerfile rename shared/{ => tools}/meltano/ods-dev/.gitignore (100%) rename shared/{ => tools}/meltano/ods-dev/analyze/.gitkeep (100%) rename shared/{ => tools}/meltano/ods-dev/extract/.gitkeep (100%) rename shared/{ => tools}/meltano/ods-dev/load/.gitkeep (100%) rename shared/{ => tools}/meltano/ods-dev/meltano.yml (100%) rename shared/{ => tools}/meltano/ods-dev/notebook/.gitkeep (100%) rename shared/{ => tools}/meltano/ods-dev/orchestrate/.gitkeep (100%) rename shared/{ => tools}/meltano/ods-dev/output/.gitignore (100%) rename shared/{meltano/ods-dev/transform/.gitkeep => tools/meltano/ods-dev/requirements.txt} (100%) create mode 100644 shared/tools/meltano/ods-dev/transform/.gitkeep rename shared/{ => tools}/trino/Dockerfile (100%) diff --git a/.github/workflows/docker-ora2pg.yaml b/.github/workflows/docker-ora2pg.yaml index 977c699..466c5e7 100644 --- a/.github/workflows/docker-ora2pg.yaml +++ b/.github/workflows/docker-ora2pg.yaml @@ -2,12 +2,12 @@ name: Push to GHCR on: push: - branches: [ "main" ] + branches: ["main"] env: REGISTRY: ghcr.io - # github.repository as / # should be renamed to end in -ora2pg - IMAGE_NAME: ${{ github.repository }} + DOCKERFILE_PATH: shared/ora2pg + IMAGE_NAME: ${{ github.repository }}-ora2pg jobs: build: @@ -46,7 +46,8 @@ jobs: id: build-and-push uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 # v5.0.0 with: - context: . + # DF-NOTE: to help the action find the Dockerfile to build from + context: ${{ env.DOCKERFILE_PATH }}/ push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/docker-trino.yaml b/.github/workflows/docker-ora2s3.yaml similarity index 93% rename from .github/workflows/docker-trino.yaml rename to .github/workflows/docker-ora2s3.yaml index fb2a9d0..789ce59 100644 --- a/.github/workflows/docker-trino.yaml +++ b/.github/workflows/docker-ora2s3.yaml @@ -5,10 +5,9 @@ on: branches: [ "main" ] env: - # DF-NOTE: pull ghcr.io/bcgov/nr-dap-ods-trino:main REGISTRY: ghcr.io - DOCKERFILE_PATH: shared/trino - IMAGE_NAME: ${{ github.repository }}-trino + DOCKERFILE_PATH: shared/ora2s3 + IMAGE_NAME: ${{ github.repository }}-ora2s3 jobs: build: @@ -61,5 +60,5 @@ jobs: # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable TAGS: ${{ steps.meta.outputs.tags }} DIGEST: ${{ steps.build-and-push.outputs.digest }} - + run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} diff --git a/.github/workflows/docker-pg2pg.yaml b/.github/workflows/docker-pg2pg.yaml index 9f472a5..6c4efa9 100644 --- a/.github/workflows/docker-pg2pg.yaml +++ b/.github/workflows/docker-pg2pg.yaml @@ -5,9 +5,8 @@ on: branches: ["main"] env: - # DF-NOTE: pull ghcr.io/bcgov/nr-dap-ods-trino:main REGISTRY: ghcr.io - DOCKERFILE_PATH: shared/ods_replication_pg2pg + DOCKERFILE_PATH: shared/pg2pg IMAGE_NAME: ${{ github.repository }}-pg2pg jobs: @@ -61,5 +60,5 @@ jobs: # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable TAGS: ${{ steps.meta.outputs.tags }} DIGEST: ${{ steps.build-and-push.outputs.digest }} - + run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9400b42..0000000 --- a/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -oracledb==1.3.1 -psycopg2==2.9.6 -pandas==2.0.2 -openpyxl==3.1.2 -configparser==6.0.0 -dbt-postgres==1.6.4 -PyYAML==6.0 -pyodbc==4.0.39 -python-dotenv==1.0.0 -boto3==1.28.10 -requests==2.31.0 -XlsxWriter==3.1.2 \ No newline at end of file diff --git a/shared/meltano/ods-dev/plugins/extractors/tap-rest-api-msdk--widen.lock b/shared/meltano/ods-dev/plugins/extractors/tap-rest-api-msdk--widen.lock deleted file mode 100644 index 5053a82..0000000 --- a/shared/meltano/ods-dev/plugins/extractors/tap-rest-api-msdk--widen.lock +++ /dev/null @@ -1,357 +0,0 @@ -{ - "plugin_type": "extractors", - "name": "tap-rest-api-msdk", - "namespace": "tap_rest_api_msdk", - "variant": "widen", - "label": "REST API", - "docs": "https://hub.meltano.com/extractors/tap-rest-api-msdk--widen", - "repo": "https://github.com/Widen/tap-rest-api-msdk", - "pip_url": "tap-rest-api-msdk", - "description": "REST API", - "logo_url": "https://hub.meltano.com/assets/logos/extractors/restapi.png", - "capabilities": [ - "about", - "batch", - "catalog", - "discover", - "schema-flattening", - "state", - "stream-maps" - ], - "settings_group_validation": [ - [ - "api_url" - ] - ], - "settings": [ - { - "name": "access_token_url", - "kind": "password", - "label": "Access Token URL", - "description": "Used for the OAuth2 authentication method. This is the end-point for the authentication server used to exchange the authorization codes for a access token.", - "sensitive": true - }, - { - "name": "api_keys", - "kind": "object", - "label": "API Keys", - "description": "A object of API Key/Value pairs used by the api_key auth method Example: { X-API-KEY: my secret value}." - }, - { - "name": "api_url", - "kind": "string", - "label": "API URL", - "description": "The base url/endpoint for the desired api" - }, - { - "name": "auth_method", - "kind": "string", - "value": "no_auth", - "label": "Auth Method", - "description": "The method of authentication used by the API. Supported options include oauth: for OAuth2 authentication, basic: Basic Header authorization - base64-encoded username + password config items, api_key: for API Keys in the header e.g. X-API-KEY,bearer_token: for Bearer token authorization, aws: for AWS Authentication. Defaults to `no_auth` which will take authentication parameters passed via the headersconfig." - }, - { - "name": "aws_credentials", - "kind": "object", - "label": "AWS Credentials", - "description": "An object of aws credentials to authenticate to access AWS services. This example is to access the AWS OpenSearch service. Example: { aws_access_key_id: my_aws_key_id, aws_secret_access_key: my_aws_secret_access_key, aws_region: us-east-1, aws_service: es, use_signed_credentials: true}" - }, - { - "name": "backoff_param", - "kind": "string", - "value": "Retry-After", - "label": "Backoff Param", - "description": "The header parameter to inspect for a backoff time. Optional: Defaults to `Retry-After`." - }, - { - "name": "backoff_time_extension", - "kind": "string", - "value": 0, - "label": "Backoff Time Extension", - "description": "An additional extension (seconds) to the backoff time over and above a jitter value - use where an API is not precise in its backoff times. Optional: Defaults to `0`." - }, - { - "name": "backoff_type", - "kind": "string", - "label": "Backoff Type", - "description": "The style of Backoff [message|header] applied to rate limited APIs. Backoff times (seconds) come from response either the `message` or `header`. Optional: Defaults to `None`." - }, - { - "name": "batch_config.encoding.compression", - "kind": "options", - "label": "Batch Config Encoding Compression", - "description": "Compression format to use for batch files.", - "options": [ - { - "label": "Gzip", - "value": "gzip" - }, - { - "label": "None", - "value": "none" - } - ] - }, - { - "name": "batch_config.encoding.format", - "kind": "options", - "label": "Batch Config Encoding Format", - "description": "Format to use for batch files.", - "options": [ - { - "label": "Jsonl", - "value": "jsonl" - } - ] - }, - { - "name": "batch_config.storage.prefix", - "kind": "string", - "label": "Batch Config Storage Prefix", - "description": "Prefix to use when writing batch files." - }, - { - "name": "batch_config.storage.root", - "kind": "string", - "label": "Batch Config Storage Root", - "description": "Root path to use when writing batch files." - }, - { - "name": "bearer_token", - "kind": "password", - "label": "Bearer Token", - "description": "Used for the Bearer Authentication method, which uses a token as part of the authorization header for authentication.", - "sensitive": true - }, - { - "name": "client_id", - "kind": "password", - "label": "Client ID", - "description": "Used for the OAuth2 authentication method. The public application ID that's assigned for Authentication. The client_id should accompany a client_secret.", - "sensitive": true - }, - { - "name": "client_secret", - "kind": "password", - "label": "Client Secret", - "description": "Used for the OAuth2 authentication method. The client_secret is a secret known only to the application and the authorization server. It is essential the application's own password.", - "sensitive": true - }, - { - "name": "except_keys", - "kind": "array", - "value": [], - "label": "Except Keys", - "description": "This tap automatically flattens the entire json structure and builds keys based on the corresponding paths. Keys, whether composite or otherwise, listed in this dictionary will not be recursively flattened, but instead their values will be; turned into a json string and processed in that format. This is also automatically done for any lists within the records; therefore, records are not duplicated for each item in lists." - }, - { - "name": "flattening_enabled", - "kind": "boolean", - "label": "Flattening Enabled", - "description": "'True' to enable schema flattening and automatically expand nested properties." - }, - { - "name": "flattening_max_depth", - "kind": "integer", - "label": "Flattening Max Depth", - "description": "The max depth to flatten schemas." - }, - { - "name": "grant_type", - "kind": "string", - "label": "Grant Type", - "description": "Used for the OAuth2 authentication method. The grant_type is required to describe the OAuth2 flow. Flows support by this tap include client_credentials, refresh_token, password." - }, - { - "name": "headers", - "kind": "object", - "label": "Headers", - "description": "An object of headers to pass into the api calls. Stream level headers will be merged with top-level params with streamlevel params overwriting top-level params with the same key." - }, - { - "name": "next_page_token_path", - "kind": "password", - "label": "Next Page Token Path", - "description": "A jsonpath string representing the path to the 'next page' token. Defaults to `$.next_page` for the `jsonpath_paginator` paginator only otherwise `None`.", - "sensitive": true - }, - { - "name": "num_inference_records", - "kind": "integer", - "value": 50, - "label": "Num Inference Records", - "description": "Number of records used to infer the stream's schema. Defaults to `50`." - }, - { - "name": "oauth_expiration_secs", - "kind": "integer", - "label": "OAuth Expiration Secs", - "description": "Used for OAuth2 authentication method. This optional setting is a timer for the expiration of a token in seconds. If not set the OAuth will use the default expiration set in the token by the authorization server." - }, - { - "name": "oauth_extras", - "kind": "object", - "label": "OAuth Extras", - "description": "A object of Key/Value pairs for additional oauth config parameters which may be required by the authorization server. Example: {resource: https://analysis.windows.net/powerbi/api}." - }, - { - "name": "pagination_limit_per_page_param", - "kind": "string", - "label": "Pagination Limit Per Page Param", - "description": "The name of the param that indicates the limit/per_page. Defaults to `None`." - }, - { - "name": "pagination_next_page_param", - "kind": "string", - "label": "Pagination Next Page Param", - "description": "The name of the param that indicates the page/offset. Defaults to `None`." - }, - { - "name": "pagination_page_size", - "kind": "integer", - "label": "Pagination Page Size", - "description": "The size of each page in records. Defaults to `None`." - }, - { - "name": "pagination_request_style", - "kind": "string", - "value": "default", - "label": "Pagination Request Style", - "description": "The pagination style to use for requests. Defaults to `default`." - }, - { - "name": "pagination_response_style", - "kind": "string", - "value": "default", - "label": "Pagination Response Style", - "description": "The pagination style to use for response. Defaults to `default`." - }, - { - "name": "pagination_results_limit", - "kind": "integer", - "label": "Pagination Results Limit", - "description": "Limits the max number of records. Defaults to `None`." - }, - { - "name": "pagination_total_limit_param", - "kind": "string", - "value": "total", - "label": "Pagination Total Limit Param", - "description": "The name of the param that indicates the total limit e.g. `total`, `count`. Defaults to `total`." - }, - { - "name": "params", - "kind": "object", - "value": {}, - "label": "Params", - "description": "An object providing the `params` in a `requests.get` method. Stream level params will be merged with top-level params with stream level params overwriting top-level params with the same key." - }, - { - "name": "password", - "kind": "password", - "label": "Password", - "description": "Used for a number of authentication methods that use a user password combination for authentication.", - "sensitive": true - }, - { - "name": "path", - "kind": "string", - "label": "Path", - "description": "The path appended to the `api_url`. Stream-level path will overwrite top-level path" - }, - { - "name": "primary_keys", - "kind": "array", - "label": "Primary Keys", - "description": "A list of the json keys of the primary key for the stream." - }, - { - "name": "records_path", - "kind": "string", - "label": "Records Path", - "description": "A jsonpath string representing the path in the requests response that contains the records to process. Defaults to `$[*]`. Stream level records_path will overwrite the top-level records_path" - }, - { - "name": "redirect_uri", - "kind": "string", - "label": "Redirect Uri", - "description": "Used for the OAuth2 authentication method. This is optional as the redirect_uri may be part of the token returned by the authentication server. If a redirect_uri is provided, it determines where the API server redirects the user after the user completes the authorization flow." - }, - { - "name": "refresh_token", - "kind": "password", - "label": "Refresh Token", - "description": "An OAuth2 Refresh Token is a string that the OAuth2 client can use to get a new access token without the user's interaction.", - "sensitive": true - }, - { - "name": "replication_key", - "kind": "password", - "label": "Replication Key", - "description": "The json response field representing the replication key. Note that this should be an incrementing integer or datetime object.", - "sensitive": true - }, - { - "name": "scope", - "kind": "string", - "label": "Scope", - "description": "Used for the OAuth2 authentication method. The scope is optional, it is a mechanism to limit the amount of access that is granted to an access token. One or more scopes can be provided delimited by a space." - }, - { - "name": "source_search_field", - "kind": "string", - "label": "Source Search Field", - "description": "An optional field name which can be used for querying specific records from supported API's. The intend for this parameter is to continue incrementally processing from a previous state. Example `last-updated`. Note: You must also set the replication_key, where the replication_key is json response representation of the API `source_search_field`. You should also supply the `source_search_query`, `replication_key` and `start_date`." - }, - { - "name": "source_search_query", - "kind": "string", - "label": "Source Search Query", - "description": "An optional query template to be issued against the API. Substitute the query field you are querying against with $last_run_date. At run-time, the tap will dynamically update the token with either the `start_date` or the last bookmark / state value. A simple template Example for FHIR APIs: gt$last_run_date. A more complex example against an Opensearch API, `\"{\\\"bool\\\": {\\\"filter\\\": [{\\\"range\\\": { \\\"meta.lastUpdated\\\": { \\\"gt\\\": \\\"$last_run_date\\\" }}}] }}\"`. Note: Any required double quotes in the query template must be escaped." - }, - { - "name": "start_date", - "kind": "date_iso8601", - "label": "Start Date", - "description": "An optional field. Normally required when using the replication_key. This is the initial starting date when using adate based replication key and there is no state available." - }, - { - "name": "store_raw_json_message", - "kind": "boolean", - "value": false, - "label": "Store Raw JSON Message", - "description": "An additional extension which will emit the whole message into an field. Optional: Defaults to `False`." - }, - { - "name": "stream_map_config", - "kind": "object", - "label": "Stream Map Config", - "description": "User-defined config values to be used within map expressions." - }, - { - "name": "stream_maps", - "kind": "object", - "label": "Stream Maps", - "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." - }, - { - "name": "streams", - "kind": "array", - "label": "Streams", - "description": "An array of streams, designed for separate paths using thesame base url.\n\nStream level config options.\n\nParameters that appear at the stream-level will overwrite their top-level counterparts except where noted below:\n\n- name: required: name of the stream.\n- path: optional: the path appended to the `api_url`.\n- params: optional: an object of objects that provide the `params` in a `requests.get` method. Stream level params will be merged with top-level params with stream level params overwriting top-level params with the same key.\n- headers: optional: an object of headers to pass into the api calls. Stream level headers will be merged with top-level params with stream level params overwriting top-level params with the same key\n- records_path: optional: a jsonpath string representing the path in the requests response that contains the records to process. Defaults to `$[*]`.\n- primary_keys: required: a list of the json keys of the primary key for the stream.\n- replication_key: optional: the json key of the replication key. Note that this should be an incrementing integer or datetime object.\n- except_keys: This tap automatically flattens the entire json structure and builds keys based on the corresponding paths. Keys, whether composite or otherwise, listed in this dictionary will not be recursively flattened, but instead their values will be turned into a json string and processed in that format. This is also automatically done for any lists within the records; therefore, records are not duplicated for each item in lists.\n- num_inference_keys: optional: number of records used to infer the stream's schema. Defaults to `50`.\n- schema: optional: A valid Singer schema or a path-like string that provides the path to a `.json` file that contains a valid Singer schema. If provided, the schema will not be inferred from the results of an api call.\n" - }, - { - "name": "use_request_body_not_params", - "kind": "boolean", - "value": false, - "label": "Use Request Body Not Params", - "description": "Sends the request parameters in the request body. This is normally not required, a few API's like OpenSearch require this. Defaults to `False`." - }, - { - "name": "username", - "kind": "string", - "label": "Username", - "description": "Used for a number of authentication methods that use a user password combination for authentication." - } - ] -} \ No newline at end of file diff --git a/shared/meltano/ods-dev/plugins/loaders/target-postgres--meltanolabs.lock b/shared/meltano/ods-dev/plugins/loaders/target-postgres--meltanolabs.lock deleted file mode 100644 index 37d8f18..0000000 --- a/shared/meltano/ods-dev/plugins/loaders/target-postgres--meltanolabs.lock +++ /dev/null @@ -1,260 +0,0 @@ -{ - "plugin_type": "loaders", - "name": "target-postgres", - "namespace": "target_postgres", - "variant": "meltanolabs", - "label": "Postgres", - "docs": "https://hub.meltano.com/loaders/target-postgres--meltanolabs", - "repo": "https://github.com/MeltanoLabs/target-postgres", - "pip_url": "meltanolabs-target-postgres", - "executable": "target-postgres", - "description": "PostgreSQL database loader", - "logo_url": "https://hub.meltano.com/assets/logos/loaders/postgres.png", - "capabilities": [ - "about", - "hard-delete", - "schema-flattening", - "stream-maps" - ], - "settings_group_validation": [ - [] - ], - "settings": [ - { - "name": "activate_version", - "kind": "boolean", - "value": true, - "label": "Activate Version", - "description": "If set to false, the tap will ignore activate version messages. If set to true, add_record_metadata must be set to true as well." - }, - { - "name": "add_record_metadata", - "kind": "boolean", - "value": true, - "label": "Add Record Metadata", - "description": "Note that this must be enabled for activate_version to work!This adds _sdc_extracted_at, _sdc_batched_at, and more to every table. See https://sdk.meltano.com/en/latest/implementation/record_metadata.html for more information." - }, - { - "name": "database", - "kind": "string", - "label": "Database", - "description": "Database name. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "default_target_schema", - "kind": "string", - "value": "$MELTANO_EXTRACT__LOAD_SCHEMA", - "label": "Default Target Schema", - "description": "Postgres schema to send data to, example: tap-clickup" - }, - { - "name": "dialect+driver", - "kind": "string", - "value": "postgresql+psycopg2", - "label": "Dialect+Driver", - "description": "Dialect+driver see https://docs.sqlalchemy.org/en/20/core/engines.html. Generally just leave this alone. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "faker_config.locale", - "kind": "array", - "label": "Faker Config Locale", - "description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization" - }, - { - "name": "faker_config.seed", - "kind": "string", - "label": "Faker Config Seed", - "description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator" - }, - { - "name": "flattening_enabled", - "kind": "boolean", - "label": "Flattening Enabled", - "description": "'True' to enable schema flattening and automatically expand nested properties." - }, - { - "name": "flattening_max_depth", - "kind": "integer", - "label": "Flattening Max Depth", - "description": "The max depth to flatten schemas." - }, - { - "name": "hard_delete", - "kind": "boolean", - "value": false, - "label": "Hard Delete", - "description": "When activate version is sent from a tap this specefies if we should delete the records that don't match, or mark them with a date in the `_sdc_deleted_at` column. This config option is ignored if `activate_version` is set to false." - }, - { - "name": "host", - "kind": "string", - "label": "Host", - "description": "Hostname for postgres instance. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "interpret_content_encoding", - "kind": "boolean", - "value": false, - "label": "Interpret Content Encoding", - "description": "If set to true, the target will interpret the content encoding of the schema to determine how to store the data. Using this option may result in a more efficient storage of the data but may also result in an error if the data is not encoded as expected." - }, - { - "name": "load_method", - "kind": "options", - "value": "append-only", - "label": "Load Method", - "description": "The method to use when loading data into the destination. `append-only` will always write all input records whether that records already exists or not. `upsert` will update existing records and insert new records. `overwrite` will delete all existing records and insert all input records.", - "options": [ - { - "label": "Append Only", - "value": "append-only" - }, - { - "label": "Upsert", - "value": "upsert" - }, - { - "label": "Overwrite", - "value": "overwrite" - } - ] - }, - { - "name": "password", - "kind": "password", - "label": "Password", - "description": "Password used to authenticate. Note if sqlalchemy_url is set this will be ignored.", - "sensitive": true - }, - { - "name": "port", - "kind": "integer", - "value": 5432, - "label": "Port", - "description": "The port on which postgres is awaiting connection. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "sqlalchemy_url", - "kind": "string", - "label": "Sqlalchemy URL", - "description": "SQLAlchemy connection string. This will override using host, user, password, port, dialect, and all ssl settings. Note that you must escape password special characters properly. See https://docs.sqlalchemy.org/en/20/core/engines.html#escaping-special-characters-such-as-signs-in-passwords" - }, - { - "name": "ssh_tunnel.enable", - "kind": "boolean", - "value": false, - "label": "SSH Tunnel Enable", - "description": "Enable an ssh tunnel (also known as bastion host), see the other ssh_tunnel.* properties for more details" - }, - { - "name": "ssh_tunnel.host", - "kind": "string", - "label": "SSH Tunnel Host", - "description": "Host of the bastion host, this is the host we'll connect to via ssh" - }, - { - "name": "ssh_tunnel.port", - "kind": "integer", - "value": 22, - "label": "SSH Tunnel Port", - "description": "Port to connect to bastion host" - }, - { - "name": "ssh_tunnel.private_key", - "kind": "password", - "label": "SSH Tunnel Private Key", - "description": "Private Key for authentication to the bastion host", - "sensitive": true - }, - { - "name": "ssh_tunnel.private_key_password", - "kind": "password", - "label": "SSH Tunnel Private Key Password", - "description": "Private Key Password, leave None if no password is set", - "sensitive": true - }, - { - "name": "ssh_tunnel.username", - "kind": "string", - "label": "SSH Tunnel Username", - "description": "Username to connect to bastion host" - }, - { - "name": "ssl_certificate_authority", - "kind": "string", - "value": "~/.postgresql/root.crl", - "label": "SSL Certificate Authority", - "description": "The certificate authority that should be used to verify the server's identity. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "ssl_client_certificate", - "kind": "string", - "value": "~/.postgresql/postgresql.crt", - "label": "SSL Client Certificate", - "description": "The certificate that should be used to verify your identity to the server. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "ssl_client_certificate_enable", - "kind": "boolean", - "value": false, - "label": "SSL Client Certificate Enable", - "description": "Whether or not to provide client-side certificates as a method of authentication to the server. Use ssl_client_certificate and ssl_client_private_key for further customization. To use SSL to verify the server's identity, use ssl_enable instead. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "ssl_client_private_key", - "kind": "password", - "value": "~/.postgresql/postgresql.key", - "label": "SSL Client Private Key", - "description": "The private key for the certificate you provided. Can be provided either as the certificate itself (in .env) or as a filepath to the certificate. Note if sqlalchemy_url is set this will be ignored.", - "sensitive": true - }, - { - "name": "ssl_enable", - "kind": "boolean", - "value": false, - "label": "SSL Enable", - "description": "Whether or not to use ssl to verify the server's identity. Use ssl_certificate_authority and ssl_mode for further customization. To use a client certificate to authenticate yourself to the server, use ssl_client_certificate_enable instead. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "ssl_mode", - "kind": "string", - "value": "verify-full", - "label": "SSL Mode", - "description": "SSL Protection method, see [postgres documentation](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-PROTECTION) for more information. Must be one of disable, allow, prefer, require, verify-ca, or verify-full. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "ssl_storage_directory", - "kind": "string", - "value": ".secrets", - "label": "SSL Storage Directory", - "description": "The folder in which to store SSL certificates provided as raw values. When a certificate/key is provided as a raw value instead of as a filepath, it must be written to a file before it can be used. This configuration option determines where that file is created." - }, - { - "name": "stream_map_config", - "kind": "object", - "label": "Stream Map Config", - "description": "User-defined config values to be used within map expressions." - }, - { - "name": "stream_maps", - "kind": "object", - "label": "Stream Maps", - "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." - }, - { - "name": "user", - "kind": "string", - "label": "User", - "description": "User name used to authenticate. Note if sqlalchemy_url is set this will be ignored." - }, - { - "name": "validate_records", - "kind": "boolean", - "value": true, - "label": "Validate Records", - "description": "Whether to validate the schema of the incoming streams." - } - ], - "dialect": "postgres", - "target_schema": "$TARGET_POSTGRES_SCHEMA" -} \ No newline at end of file diff --git a/Dockerfile b/shared/ora2pg/Dockerfile similarity index 93% rename from Dockerfile rename to shared/ora2pg/Dockerfile index 748db31..319b61b 100644 --- a/Dockerfile +++ b/shared/ora2pg/Dockerfile @@ -26,4 +26,4 @@ COPY requirements.txt requirements.txt RUN pip3 install -r requirements.txt -CMD ["python3", "./data_replication_parametrized_audit_os.py"] \ No newline at end of file +CMD ["python3", "./data_replication_ora2pg.py"] \ No newline at end of file diff --git a/data_replication_parametrized_audit_os.py b/shared/ora2pg/data_replication_ora2pg.py similarity index 100% rename from data_replication_parametrized_audit_os.py rename to shared/ora2pg/data_replication_ora2pg.py diff --git a/shared/ora2pg/requirements.txt b/shared/ora2pg/requirements.txt new file mode 100644 index 0000000..8d03486 --- /dev/null +++ b/shared/ora2pg/requirements.txt @@ -0,0 +1,6 @@ +oracledb==1.3.1 +psycopg2==2.9.6 +pandas==2.0.2 +configparser==6.0.0 +pyodbc==4.0.39 +python-dotenv==1.0.0 \ No newline at end of file diff --git a/shared/ora2s3/Dockerfile b/shared/ora2s3/Dockerfile new file mode 100644 index 0000000..23701e9 --- /dev/null +++ b/shared/ora2s3/Dockerfile @@ -0,0 +1,21 @@ +FROM python + +WORKDIR /opt/oracle +RUN apt-get update && apt-get install -y libaio1 wget unzip \ + && wget https://download.oracle.com/otn_software/linux/instantclient/instantclient-basiclite-linuxx64.zip \ + && unzip instantclient-basiclite-linuxx64.zip \ + && rm -f instantclient-basiclite-linuxx64.zip \ + && cd /opt/oracle/instantclient* \ + && rm -f *jdbc* *occi* *mysql* *README *jar uidrvci genezi adrci \ + && echo /opt/oracle/instantclient* > /etc/ld.so.conf.d/oracle-instantclient.conf \ + && ldconfig + +WORKDIR /app + +ADD data_replication_ora2s3 . + +COPY requirements.txt requirements.txt + +RUN pip3 install -r requirements.txt + +CMD ["python3", "./data_replication_ora2s3"] \ No newline at end of file diff --git a/shared/ora2s3/data_replication_ora2s3.py b/shared/ora2s3/data_replication_ora2s3.py new file mode 100644 index 0000000..9721f2c --- /dev/null +++ b/shared/ora2s3/data_replication_ora2s3.py @@ -0,0 +1,68 @@ +import os +import io +import sys +import oracledb +import boto3 +import pandas + +objurl= os.environ['objurl'] +objbucket = os.environ['objbucket'] +objid = os.environ['objid'] +objkey = os.environ['objkey'] +s3key = os.environ['s3key'] + +username = os.environ['username'] +password = os.environ['password'] +host = os.environ['host'] +port = os.environ['port'] +database = os.environ['database'] +sql_query = os.environ['sql_query'] + +def extract_from_oracle(username, password, host, port, database, sql_query): + dsn = oracledb.makedsn(host=host, port=port, service_name=database) + print(dsn) + connection_pool = oracledb.SessionPool(user=username, password=password, dsn=dsn, encoding="UTF-8") + connection = connection_pool.acquire() + cursor = connection.cursor() + try: + # with open('query.sql', 'r') as sql_file: + # sql_query = sql_file.read() + print(sql_query) + cursor.execute(sql_query) + rows = cursor.fetchall() + column_names = [col[0] for col in cursor.description] + connection_pool.release(connection) + return rows, column_names + except Exception as e: + print(f"Error extracting data from Oracle: {str(e)}") + return [] + +def create_excel_dataframe(rows, column_names): + df = pandas.DataFrame(rows, columns=column_names) + with io.BytesIO() as output: + with pandas.ExcelWriter(output, engine='xlsxwriter') as writer: + df.to_excel(writer, index=False) + exceldata = output.getvalue() + return exceldata + +def upload_to_s3(objbucket, objid, objkey, s3key, exceldata): + try: + session = boto3.Session(aws_access_key_id=objid, aws_secret_access_key=objkey) + s3_client = boto3.client('s3',endpoint_url=objurl, aws_access_key_id=objid, aws_secret_access_key=objkey) + s3_resource = session.resource('s3', endpoint_url=objurl) + s3_client.delete_object(Bucket=objbucket,Key=s3key) + bucket = s3_resource.Bucket(objbucket) + bucket.put_object(Key=s3key, Body=exceldata) + except Exception as e: + raise Exception(f"S3 Error: {str(e)}") + +try: + rows, column_names = extract_from_oracle(username, password, host, port, database, sql_query) + exceldata = create_excel_dataframe(rows, column_names) + upload_to_s3(objbucket, objid, objkey, s3key, exceldata) + print(f'Successfully uploaded to S3 bucket {objbucket} with key {s3key}.') + sys.exit(0) + +except Exception as e: + print(f"Error: {str(e)}") + sys.exit(1) \ No newline at end of file diff --git a/shared/ora2s3/requirements.txt b/shared/ora2s3/requirements.txt new file mode 100644 index 0000000..dbfef02 --- /dev/null +++ b/shared/ora2s3/requirements.txt @@ -0,0 +1,4 @@ +oracledb +pandas +boto3 +xlsxwriter \ No newline at end of file diff --git a/shared/pg2pg/requirements.txt b/shared/pg2pg/requirements.txt index 9400b42..5459139 100644 --- a/shared/pg2pg/requirements.txt +++ b/shared/pg2pg/requirements.txt @@ -1,12 +1,5 @@ -oracledb==1.3.1 psycopg2==2.9.6 pandas==2.0.2 -openpyxl==3.1.2 configparser==6.0.0 -dbt-postgres==1.6.4 -PyYAML==6.0 pyodbc==4.0.39 -python-dotenv==1.0.0 -boto3==1.28.10 -requests==2.31.0 -XlsxWriter==3.1.2 \ No newline at end of file +python-dotenv==1.0.0 \ No newline at end of file diff --git a/shared/tools/meltano/Dockerfile b/shared/tools/meltano/Dockerfile new file mode 100644 index 0000000..25af3f1 --- /dev/null +++ b/shared/tools/meltano/Dockerfile @@ -0,0 +1,3 @@ +FROM meltano/meltano + +CMD ["--help"] \ No newline at end of file diff --git a/shared/meltano/ods-dev/.gitignore b/shared/tools/meltano/ods-dev/.gitignore similarity index 100% rename from shared/meltano/ods-dev/.gitignore rename to shared/tools/meltano/ods-dev/.gitignore diff --git a/shared/meltano/ods-dev/analyze/.gitkeep b/shared/tools/meltano/ods-dev/analyze/.gitkeep similarity index 100% rename from shared/meltano/ods-dev/analyze/.gitkeep rename to shared/tools/meltano/ods-dev/analyze/.gitkeep diff --git a/shared/meltano/ods-dev/extract/.gitkeep b/shared/tools/meltano/ods-dev/extract/.gitkeep similarity index 100% rename from shared/meltano/ods-dev/extract/.gitkeep rename to shared/tools/meltano/ods-dev/extract/.gitkeep diff --git a/shared/meltano/ods-dev/load/.gitkeep b/shared/tools/meltano/ods-dev/load/.gitkeep similarity index 100% rename from shared/meltano/ods-dev/load/.gitkeep rename to shared/tools/meltano/ods-dev/load/.gitkeep diff --git a/shared/meltano/ods-dev/meltano.yml b/shared/tools/meltano/ods-dev/meltano.yml similarity index 100% rename from shared/meltano/ods-dev/meltano.yml rename to shared/tools/meltano/ods-dev/meltano.yml diff --git a/shared/meltano/ods-dev/notebook/.gitkeep b/shared/tools/meltano/ods-dev/notebook/.gitkeep similarity index 100% rename from shared/meltano/ods-dev/notebook/.gitkeep rename to shared/tools/meltano/ods-dev/notebook/.gitkeep diff --git a/shared/meltano/ods-dev/orchestrate/.gitkeep b/shared/tools/meltano/ods-dev/orchestrate/.gitkeep similarity index 100% rename from shared/meltano/ods-dev/orchestrate/.gitkeep rename to shared/tools/meltano/ods-dev/orchestrate/.gitkeep diff --git a/shared/meltano/ods-dev/output/.gitignore b/shared/tools/meltano/ods-dev/output/.gitignore similarity index 100% rename from shared/meltano/ods-dev/output/.gitignore rename to shared/tools/meltano/ods-dev/output/.gitignore diff --git a/shared/meltano/ods-dev/transform/.gitkeep b/shared/tools/meltano/ods-dev/requirements.txt similarity index 100% rename from shared/meltano/ods-dev/transform/.gitkeep rename to shared/tools/meltano/ods-dev/requirements.txt diff --git a/shared/tools/meltano/ods-dev/transform/.gitkeep b/shared/tools/meltano/ods-dev/transform/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/shared/trino/Dockerfile b/shared/tools/trino/Dockerfile similarity index 100% rename from shared/trino/Dockerfile rename to shared/tools/trino/Dockerfile