From 0010f6021e6853be49c4b3821f4e1b24cc6f1fdb Mon Sep 17 00:00:00 2001 From: Rafael Pereira Date: Thu, 7 Jun 2018 23:44:50 -0500 Subject: [PATCH] Simplify & complete configuration + Docker release (#15) - Simplify configuration - Add Singer usage data collection - Automate Docker Hub release GH Issues: - #12 --- .circleci/config.yml | 87 ++++++++++++++++++++---- .pytest_cache/v/cache/lastfailed | 3 + .pytest_cache/v/cache/nodeids | 69 +++++++++++++++++++ Dockerfile | 7 ++ README.rst | 18 ++--- target_datadotworld/__init__.py | 2 +- target_datadotworld/cli.py | 9 +++ target_datadotworld/singer_analytics.py | 38 +++++++++++ target_datadotworld/target.py | 53 +++------------ target_datadotworld/utils.py | 5 -- tests/target_datadotworld/test_target.py | 25 ++----- tests/target_datadotworld/test_utils.py | 10 +-- tox.ini | 2 +- 13 files changed, 222 insertions(+), 106 deletions(-) create mode 100644 .pytest_cache/v/cache/lastfailed create mode 100644 .pytest_cache/v/cache/nodeids create mode 100644 Dockerfile create mode 100644 target_datadotworld/singer_analytics.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 037eab4..44ef83b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,15 +1,11 @@ version: 2 jobs: - build: + test: docker: - image: dataworld/pyenv-tox working_directory: /root/target-datadotworld - environment: - PRERELEASE_BRANCH: prerelease - RELEASE_BRANCH: release - steps: - checkout @@ -26,23 +22,86 @@ jobs: name: tox command: tox --pre + - persist_to_workspace: + root: . + paths: + - ./* + - save_cache: key: tox_cache-{{ checksum "tox.ini" }} paths: - .eggs - .tox + pypi-release: + + docker: + - image: dataworld/pyenv-tox + + working_directory: /root/target-datadotworld + + steps: + - attach_workspace: + at: /root/target-datadotworld + + - run: + name: build dist + command: python setup.py sdist bdist_wheel --universal + - deploy: - name: Pre-release to pypi + name: pypi release + command: twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD dist/* + + docker-release: + docker: + - image: dataworld/pyenv-tox + + working_directory: /root/target-datadotworld + + steps: + - attach_workspace: + at: /root/target-datadotworld + + - run: + name: define PACKAGE_VERSION command: | - if [[ "${CIRCLE_BRANCH}" =~ ^(${PRERELEASE_BRANCH})$ ]]; then - echo 'Do a prerelease with twine here' - fi + echo "export PACKAGE_VERSION=$(python -c "import pkg_resources; print(pkg_resources.get_distribution('target-datadotworld').version)")" >> $BASH_ENV + source $BASH_ENV + + - setup_remote_docker: + docker_layer_caching: true + + - run: + name: docker setup + command: curl -sSL https://get.docker.com/ | sh + + - run: + name: docker build + command: docker build -t dataworld/target-datadotworld -t dataworld/target-datadotworld:$PACKAGE_VERSION . 
- deploy: - name: Release to pypi + name: docker-hub release command: | - if [[ "${CIRCLE_BRANCH}" =~ ^(${RELEASE_BRANCH})$ ]]; then - python setup.py sdist bdist_wheel --universal - twine upload -u $PYPI_USERNAME -p $PYPI_PASSWORD dist/* - fi + docker login -u $DOCKER_USER -p $DOCKER_PASS + docker push dataworld/target-datadotworld:latest + docker push dataworld/target-datadotworld:$PACKAGE_VERSION + +workflows: + version: 2 + test-double-release: + jobs: + - test + - pypi-release: + filters: + branches: + only: + - release + requires: + - test + - docker-release: + filters: + branches: + only: + - release + requires: + - test \ No newline at end of file diff --git a/.pytest_cache/v/cache/lastfailed b/.pytest_cache/v/cache/lastfailed new file mode 100644 index 0000000..902ea2a --- /dev/null +++ b/.pytest_cache/v/cache/lastfailed @@ -0,0 +1,3 @@ +{ + "tests/target_datadotworld/test_target.py::TestTarget::()": true +} \ No newline at end of file diff --git a/.pytest_cache/v/cache/nodeids b/.pytest_cache/v/cache/nodeids new file mode 100644 index 0000000..e95f8e6 --- /dev/null +++ b/.pytest_cache/v/cache/nodeids @@ -0,0 +1,69 @@ +[ + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream[5]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream[10]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream[15]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked[5-3]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked[5-5]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked[10-3]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked[10-5]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked[15-3]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked[15-5]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked_error[5-3]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked_error[5-5]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked_error[10-3]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked_error[10-5]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked_error[15-3]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_chunked_error[15-5]", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_append_stream_error", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_connection_check", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_connection_check_401", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_connection_check_offline", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_get_current_version", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_get_current_version_missing_column", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_get_current_version_missing_table", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_get_current_version_error", + 
"tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_set_stream_schema", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_set_stream_schema_error", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_sync", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_sync_error", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_truncate_stream_records", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_truncate_stream_records_error", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_create_dataset", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_create_dataset_error", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_get_dataset", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test_get_dataset_error", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test__retry_if_throttled", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test__retry_if_throttled_delayed", + "tests/target_datadotworld/test_api_client.py::TestApiClient::()::test__retry_if_throttled_error", + "tests/target_datadotworld/test_exceptions.py::test_convert_requests_exception[400-ApiError]", + "tests/target_datadotworld/test_exceptions.py::test_convert_requests_exception[401-UnauthorizedError]", + "tests/target_datadotworld/test_exceptions.py::test_convert_requests_exception[403-ForbiddenError]", + "tests/target_datadotworld/test_exceptions.py::test_convert_requests_exception[404-NotFoundError]", + "tests/target_datadotworld/test_exceptions.py::test_convert_requests_exception[422-ApiError]", + "tests/target_datadotworld/test_exceptions.py::test_convert_requests_exception[429-TooManyRequestsError]", + "tests/target_datadotworld/test_exceptions.py::test_convert_requests_exception_offline", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_minimal", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_complete", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_incomplete", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_invalid[invalid_config0]", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_invalid[invalid_config1]", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_invalid[invalid_config2]", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_invalid[invalid_config3]", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_invalid[invalid_config4]", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_config_invalid[invalid_config5]", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_process_lines", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_process_lines_new_dataset", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_process_lines_unparseable", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_process_lines_missing_schema", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_process_lines_multiple_streams", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_process_no_state", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_process_multi_state", + "tests/target_datadotworld/test_target.py::TestTarget::()::test_process_same_version", + 
"tests/target_datadotworld/test_target.py::TestTarget::()::test_process_new_version", + "tests/target_datadotworld/test_utils.py::test_to_jsonline", + "tests/target_datadotworld/test_utils.py::test_to_chunks[5]", + "tests/target_datadotworld/test_utils.py::test_to_chunks[10]", + "tests/target_datadotworld/test_utils.py::test_to_chunks[15]", + "tests/target_datadotworld/test_utils.py::test_to_streamid[aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa]", + "tests/target_datadotworld/test_utils.py::test_to_streamid[a1!_b@2_c3-a-1-b-2-c-3]" +] \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cc9b9e0 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM python:latest + +ADD . /code +WORKDIR /code + +RUN pip install . +CMD ["target-datadotworld"] diff --git a/README.rst b/README.rst index bd7d3eb..7d03718 100644 --- a/README.rst +++ b/README.rst @@ -7,8 +7,8 @@ A `Singer `_ target that writes data to `data.world `_ to move -data from sources like `SalesForce `_, `HubSpot `_, `Marketo `_, `MySQL `_ and `more `_ to data.world. +``target-datadotworld`` works together with any other `Singer Tap `_ to store on data.world +data extracted from sources like `SalesForce `_, `HubSpot `_, `Marketo `_, `MySQL `_ and `more `_. Install and Run --------------- @@ -30,24 +30,21 @@ and then run them together, piping the output of ``tap-fixerio`` to The data will be written to the dataset specified in ``config.json``. In this specific case, under a stream named ``exchange-rates``. If you're using a different Tap, substitute ``tap-fixerio`` in the final -command above to the command used to run your Tap. +command above with the command used to run your Tap. Configuration ------------- -`target-datadotworld` requires configuration file that is used to store your data.world API token, dataset information and other additional configuration. +`target-datadotworld` requires configuration file that is used to store your data.world API token and dataset information. The following attributes are required: * ``api_token``: Your data.world `API token `_ * ``dataset_id``: The title of the dataset where the data is to be stored. Must only contain lowercase letters, numbers, and dashes. -Additionally, the following optional attributes can be provided. They determine the parameters for creating a new dataset if ``dataset_id`` refers to a dataset that doesn't yet exist: +Additionally, the following optional attributes can be provided. -* ``dataset_title``: Text with no more than 60 characters -* ``dataset_visibility``: OPEN or PRIVATE -* ``dataset_license``: Public Domain, PDDL, CC-0, CC-BY, ODC-BY, CC-BY-SA, ODC-ODbL, CC BY-NC, CC BY-NC-SA or Other -* ``dataset_owner``: If not the same as the owner of the API token (e.g. if the dataset is to be created under an organization account, as opposed to the user's own) +* ``dataset_owner``: If not the same as the owner of the API token (e.g. 
+  if the dataset is to be accessed/created under an organization account, as opposed to the user's own)
 
 Example:
 
@@ -56,8 +53,5 @@ Example:
 
 {
     "api_token": "your_token",
     "dataset_id": "fixerio-data",
-    "dataset_title": "Fixerio Data",
-    "dataset_license": "Other",
-    "dataset_owner": "my-company",
-    "dataset_visibility": "PRIVATE"
+    "dataset_owner": "my-company"
 }
diff --git a/target_datadotworld/__init__.py b/target_datadotworld/__init__.py
index f0c61be..e8d8cf4 100644
--- a/target_datadotworld/__init__.py
+++ b/target_datadotworld/__init__.py
@@ -21,6 +21,6 @@
 
 import singer
 
-__version__ = '1.0.0b4'
+__version__ = '1.0.0'
 
 logger = copy(singer.get_logger())  # copy needed in order to set level
diff --git a/target_datadotworld/cli.py b/target_datadotworld/cli.py
index 2b301b8..68b5bf7 100644
--- a/target_datadotworld/cli.py
+++ b/target_datadotworld/cli.py
@@ -21,11 +21,13 @@
 import json
 import logging
 import warnings
+from concurrent.futures import ThreadPoolExecutor
 
 import click
 
 from target_datadotworld import logger
 from target_datadotworld.exceptions import Error
+from target_datadotworld.singer_analytics import send_usage_stats
 from target_datadotworld.target import TargetDataDotWorld
 
 
@@ -56,6 +58,13 @@ def cli(ctx, config, debug, file):
     try:
         config_obj = json.load(config)
 
+        if not config_obj.get('disable_collection', False):
+            logger.info('Sending version information to singer.io. ' +
+                        'To disable sending anonymous usage data, set ' +
+                        'the config parameter "disable_collection" to true')
+            loop.run_in_executor(ThreadPoolExecutor(max_workers=1),
+                                 send_usage_stats)
+
         target = TargetDataDotWorld(config_obj)
 
         data_file = file or click.get_text_stream('stdin')
diff --git a/target_datadotworld/singer_analytics.py b/target_datadotworld/singer_analytics.py
new file mode 100644
index 0000000..7b6a078
--- /dev/null
+++ b/target_datadotworld/singer_analytics.py
@@ -0,0 +1,38 @@
+# target-datadotworld
+# Copyright 2017 data.world, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the
+# License.
+#
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+#
+# This product includes software developed at
+# data.world, Inc.(http://data.world/).
+import requests
+import target_datadotworld
+from target_datadotworld import logger
+from requests import RequestException
+
+
+def send_usage_stats():
+    try:
+        version = target_datadotworld.__version__
+        resp = requests.get('http://collector.singer.io/i',
+                            params={
+                                'e': 'se',
+                                'aid': 'singer',
+                                'se_ca': 'target-datadotworld',
+                                'se_ac': 'open',
+                                'se_la': version,
+                            }, timeout=0.5)
+        resp.raise_for_status()
+    except RequestException:  # also covers timeouts and connection errors
+        logger.debug('Collection request failed')
diff --git a/target_datadotworld/target.py b/target_datadotworld/target.py
index b2ca232..403de74 100644
--- a/target_datadotworld/target.py
+++ b/target_datadotworld/target.py
@@ -32,7 +32,7 @@
 from target_datadotworld.exceptions import NotFoundError, TokenError, \
     ConfigError, MissingSchemaError, InvalidRecordError, \
     UnparseableMessageError, InvalidDatasetStateError
-from target_datadotworld.utils import to_dataset_id, to_stream_id
+from target_datadotworld.utils import to_stream_id
 
 #: Json schema specifying what is required in the config.json file
 CONFIG_SCHEMA = config_schema = {
@@ -49,45 +49,20 @@
         'dataset_id': {
             'description': 'Target dataset id',
             'type': 'string',
-            'pattern': '[a-z0-9](?:-(?!-)|[a-z0-9]){1,93}[a-z0-9]'
-        },
-        'dataset_title': {
-            'description': 'Title for new dataset created',
-            'type': 'string',
-            'minLength': 3,
-            'maxLength': 60
-        },
-        'dataset_license': {
-            'description': 'License for new dataset created',
-            'type': 'string',
-            'enum': [
-                'Public Domain', 'PDDL', 'CC-0', 'CC-BY',
-                'ODC-BY', 'CC-BY-SA', 'ODC-ODbL', 'CC BY-NC',
-                'CC BY-NC-SA', 'Other'
-            ]
-        },
-        'dataset_visibility': {
-            'description': 'Visibility for new dataset created',
-            'type': 'string',
-            'enum': ['OPEN', 'PRIVATE']
+            'pattern': '^[a-z0-9](?:-(?!-)|[a-z0-9]){1,93}[a-z0-9]$'
         },
         'dataset_owner': {
             'description': 'Account for new dataset created, '
                            'if not the owner of the token',
             'type': 'string',
-            'pattern': '[a-z0-9](?:-(?!-)|[a-z0-9]){1,29}[a-z0-9]'
+            'pattern': '^[a-z0-9](?:-(?!-)|[a-z0-9]){1,29}[a-z0-9]$'
         },
-        'namespace': {
-            'description': 'Target dataset title (reserved for Stitch)',
-            'type': 'string',
-            'minLength': 3,
-            'maxLength': 60
+        'disable_collection': {
+            'description': 'If true, disables Singer usage data collection',
+            'type': 'boolean'
         }
     },
-    'oneOf': [
-        {'required': ['api_token', 'namespace']},
-        {'required': ['api_token', 'dataset_id']}
-    ]
+    'required': ['api_token', 'dataset_id']
 }
 
 
@@ -170,9 +145,8 @@ async def _fix_dataset(self):
             self._api_client.create_dataset(
                 self.config['dataset_owner'],
                 self.config['dataset_id'],
-                title=self.config['dataset_title'],
-                visibility=self.config['dataset_visibility'],
-                license=self.config['dataset_license'])
+                title=self.config['dataset_id'],
+                visibility='PRIVATE')
 
     async def _handle_active_version_msg(self, msg, current_version, api):
         if current_version is None:
@@ -285,13 +259,6 @@ def config(self, config):
             raise TokenError()
 
         self._config = copy(config)
-        self._config['dataset_id'] = (config.get('dataset_id') or
-                                      to_dataset_id(config.get('namespace')))
-        self._config['dataset_title'] = (config.get('dataset_title') or
-                                         config.get('namespace') or
-                                         config.get('dataset_id'))
+        self._config['dataset_id'] = config.get('dataset_id')
         self._config['dataset_owner'] = config.get(
             'dataset_owner', sub_parties[1])
-        self._config['dataset_visibility'] = config.get(
-            'dataset_visibility', 'PRIVATE')
-        self._config['dataset_license'] = config.get('dataset_license')
diff --git a/target_datadotworld/utils.py b/target_datadotworld/utils.py
index bd4534d..5577c81 100644
--- a/target_datadotworld/utils.py +++ b/target_datadotworld/utils.py @@ -71,11 +71,6 @@ def to_stream_id(stream_name): return kebab_case(stream_name)[0:95] -def to_dataset_id(dataset_title): - """Convert any string into a valid dataset ID""" - return kebab_case(dataset_title)[0:95] - - def to_table_name(stream_id): """Convert a stream ID into a table name""" return stream_id.replace('-', '_') diff --git a/tests/target_datadotworld/test_target.py b/tests/target_datadotworld/test_target.py index a626170..4bb725b 100644 --- a/tests/target_datadotworld/test_target.py +++ b/tests/target_datadotworld/test_target.py @@ -85,23 +85,18 @@ def sample_config(self): '9FsdsBZ03wx0A-QK1wq2tGyinaqUcjaotp-rnWCMoMOY83ivypu' 'B3FcjTGzJPFIGZbJsES_bx0itijwz5mQvg', 'dataset_id': 'my-dataset', - 'dataset_title': 'My Dataset', - 'dataset_license': 'Other', - 'dataset_owner': 'rafael', - 'dataset_visibility': 'PRIVATE' + 'dataset_owner': 'rafael' } @pytest.fixture(params=[ ('api_token', 'eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiJwcm9kLXVzZXItY2xpZW' '50OnJhZmFlbCIsImlzcyI6ImFnZW50OnJhZmFlbDo6YjY1NTgxO' 'DItMjRkNy00MWZiLTkxNTAtNjZl'), - ('dataset_license', 'NOTALICENSE'), ('dataset_owner', 'x'), ('dataset_owner', 'Mr.X'), - ('dataset_visibility', 'Invisible'), - ('dataset_title', 'xyz' * 43), - ('dataset_title', 'xy'), - ('namespace', 'me too me too') + ('dataset_owner', 'Acme, Inc.'), + ('dataset_id', 'd'), + ('dataset_id', 'I am a non-conformist') ]) def invalid_config(self, request, sample_config): invalid_config = copy(sample_config) @@ -115,18 +110,6 @@ def test_config_minimal(self, sample_config): } target = TargetDataDotWorld(minimal_config) expected_config = copy(sample_config) - expected_config['dataset_license'] = None - expected_config['dataset_title'] = expected_config['dataset_id'] - assert_that(target.config, has_entries(expected_config)) - - def test_config_namespace(self, sample_config): - minimal_config = { - 'api_token': sample_config['api_token'], - 'namespace': sample_config['dataset_title'] - } - target = TargetDataDotWorld(minimal_config) - expected_config = copy(sample_config) - expected_config['dataset_license'] = None assert_that(target.config, has_entries(expected_config)) def test_config_complete(self, sample_config): diff --git a/tests/target_datadotworld/test_utils.py b/tests/target_datadotworld/test_utils.py index a1cbe31..739a23b 100644 --- a/tests/target_datadotworld/test_utils.py +++ b/tests/target_datadotworld/test_utils.py @@ -26,7 +26,7 @@ from hamcrest import equal_to from target_datadotworld.utils import to_chunks, to_jsonlines, \ - to_stream_id, to_dataset_id + to_stream_id def test_to_jsonline(): @@ -54,11 +54,3 @@ async def test_to_chunks(records_queue): ]) def test_to_streamid(text, streamid): assert_that(to_stream_id(text), equal_to(streamid)) - - -@pytest.mark.parametrize('text,datasetid', [ - ('a' * 100, 'a' * 95), - ('a1!_b@2_c3', 'a-1-b-2-c-3') -]) -def test_to_dataset_id(text, datasetid): - assert_that(to_dataset_id(text), equal_to(datasetid)) diff --git a/tox.ini b/tox.ini index ced4a41..e7d7bf2 100644 --- a/tox.ini +++ b/tox.ini @@ -8,4 +8,4 @@ commands= flake8 . \ --exclude=./.tox/*,./.eggs/*,./build/* coverage run setup.py test {posargs} - coverage report --omit=./.tox/* --fail-under=90 + coverage report --omit=.tox/*,target_datadotworld/cli.py --fail-under=90
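
Reviewer note, not part of the commit: the substantive piece of the CONFIG_SCHEMA change is the new ^...$ anchors. JSON Schema's "pattern" keyword is not implicitly anchored, and the Python jsonschema package evaluates it with re.search, so the old patterns accepted any string that merely contained a valid id as a substring. A standalone sketch of the difference (variable names and sample values are illustrative only; the two patterns are copied verbatim from CONFIG_SCHEMA before and after this patch):

    import re

    # dataset_id pattern before and after this patch
    UNANCHORED = r'[a-z0-9](?:-(?!-)|[a-z0-9]){1,93}[a-z0-9]'
    ANCHORED = r'^[a-z0-9](?:-(?!-)|[a-z0-9]){1,93}[a-z0-9]$'

    for candidate in ('fixerio-data', 'my-dataset', 'd', 'I am a non-conformist'):
        # re.search mirrors how jsonschema checks 'pattern'
        old = re.search(UNANCHORED, candidate) is not None
        new = re.search(ANCHORED, candidate) is not None
        print('%-22s old=%-5s new=%s' % (candidate, old, new))

'I am a non-conformist' passes the old pattern via the substring 'non-conformist' but fails the anchored one, which is exactly the new invalid_config case in test_target.py; 'd' fails both, since the pattern requires at least three characters.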