diff --git a/.code_quality/bandit.yaml b/.code_quality/bandit.yaml deleted file mode 120000 index d35efac..0000000 --- a/.code_quality/bandit.yaml +++ /dev/null @@ -1 +0,0 @@ -../{{cookiecutter.repo_name}}/.code_quality/bandit.yaml \ No newline at end of file diff --git a/Makefile b/Makefile index 642a390..25911e4 100644 --- a/Makefile +++ b/Makefile @@ -28,9 +28,13 @@ docs_view: ## Build and serve the documentation @echo "🚀 Viewing documentation..." @poetry run mkdocs serve -docs-test: ## Test if documentation can be built without warnings or errors +docs_test: ## Test if documentation can be built without warnings or errors @poetry run mkdocs build -s +view_tree: ## View the project tree + @echo "🚀 Viewing project tree..." + @tree -a {{cookiecutter.repo_name}} -I '__init__.py|.gitkeep' + ####----Tests----#### test: ## Test the code with pytest and coverage diff --git a/README.md b/README.md index a63e418..5ac298b 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,6 @@ Folder structure for data science projects [why?](https://towardsdatascience.co . ├── codecov.yml # configuration for codecov ├── .code_quality -│   ├── bandit.yaml # bandit configuration │   ├── mypy.ini # mypy configuration │   └── ruff.toml # ruff configuration ├── data @@ -127,12 +126,13 @@ Folder structure for data science projects [why?](https://towardsdatascience.co │   │   └── action.yml # github action to setup python environment │   ├── dependabot.md # github action to update dependencies │   ├── pull_request_template.md # template for pull requests -│   └── workflows -│   ├── docs.yml # github action to build documentation (mkdocs) -│   ├── pre-commit_autoupdate.yml # github action update pre-commit hooks -│   └── test.yml +│   └── workflows # github actions workflows +│   ├── ci.yml # run continuous integration (tests, pre-commit, etc.) 
+│   ├── dependency_review.yml # review dependencies +│   ├── docs.yml # build documentation (mkdocs) +│   └── pre-commit_autoupdate.yml # update pre-commit hooks ├── .gitignore # files to ignore in git -├── Makefile # useful commands to setup environment, +├── Makefile # useful commands to set up the environment, run tests, etc. ├── models # store final models ├── notebooks │   ├── 1-data # data extraction and cleaning @@ -149,9 +149,34 @@ Folder structure for data science projects [why?](https://towardsdatascience.co ├── pyproject.toml # dependencies for poetry ├── README.md # description of your project ├── src # source code for use in this project +│ ├── libs # custom python scripts +│ │ ├── data_etl # data extraction, transformation, and loading +│ │ ├── data_validation # data validation +│ │ ├── feat_cleaning # feature engineering data cleaning +│ │ ├── feat_encoding # feature engineering encoding +│ │ ├── feat_imputation # feature engineering imputation +│ │ ├── feat_new_features # feature engineering new features +│ │ ├── feat_pipelines # feature engineering pipelines +│ │ ├── feat_preprocess_strings # feature engineering string preprocessing +│ │ ├── feat_scaling # feature engineering scaling data +│ │ ├── feat_selection # feature engineering feature selection +│ │ ├── feat_strings # feature engineering strings +│ │ ├── metrics # evaluation metrics +│ │ ├── model # model training and prediction +│ │ ├── model_evaluation # model evaluation +│ │ ├── model_selection # model selection +│ │ ├── model_validation # model validation +│ │ └── reports # reports +│ ├── pipelines +│ │ ├── data_etl # data extraction, transformation, and loading +│ │ ├── feature_engineering # prepare data for modeling +│ │ ├── model_evaluation # evaluate model performance +│ │ ├── model_prediction # model predictions +│ │ └── model_train # train models ├── tests # test code for your project └── .vscode # vscode configuration ├── extensions.json # list of recommended extensions + ├── 
launch.json # vscode launch configuration └── settings.json # vscode settings ``` diff --git a/docs/data_schema.md b/docs/data_schema.md new file mode 100644 index 0000000..b6209db --- /dev/null +++ b/docs/data_schema.md @@ -0,0 +1 @@ +--8<-- "{{cookiecutter.repo_name}}/data/README.md" diff --git a/docs/directory_hierarchy.md b/docs/directory_hierarchy.md new file mode 100644 index 0000000..1110ce7 --- /dev/null +++ b/docs/directory_hierarchy.md @@ -0,0 +1 @@ +--8<-- "README.md:104:182" diff --git a/docs/index.md b/docs/index.md index 612c7a5..7f5d635 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1 +1,2 @@ ---8<-- "README.md" +--8<-- "README.md::20" +--8<-- "README.md:52" diff --git a/mkdocs.yml b/mkdocs.yml index 5635c5b..1959008 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -19,12 +19,15 @@ theme: nav: - 📖 Introduction: 'index.md' + - 🗃️ Project structure: + - 🗂️ Directory Hierarchy: 'directory_hierarchy.md' + - 🗄️ Data Schema: 'data_schema.md' - 🔑 Setup Tokens: 'setup_tokens.md' - - ⚙️ Pre-commit configuration: 'pre-commit.md' + - ⚙️ Pre-commit configuration: 'pre-commit.md' - 🚀 Github Actions: - Pre-commit_autoupdate: 'github_actions/gh_action_pre-commit-autoupdate.md' - 💻 VSCode configuration: 'vscode.md' - - 🏠 Local Dev setup: 'local_setup.md' + - 🛠️ Local Dev setup: 'local_setup.md' markdown_extensions: - pymdownx.snippets: @@ -39,3 +42,4 @@ markdown_extensions: pygments_lang_class: true - pymdownx.inlinehilite - pymdownx.superfences + - tables diff --git a/{{cookiecutter.repo_name}}/.code_quality/bandit.yaml b/{{cookiecutter.repo_name}}/.code_quality/bandit.yaml deleted file mode 100644 index 5e38b72..0000000 --- a/{{cookiecutter.repo_name}}/.code_quality/bandit.yaml +++ /dev/null @@ -1,404 +0,0 @@ - -### Bandit config file: - -### This config may optionally select a subset of tests to run or skip by -### filling out the 'tests' and 'skips' lists given below. If no tests are -### specified for inclusion then it is assumed all tests are desired. 
The skips -### set will remove specific tests from the include set. This can be controlled -### using the -t/-s CLI options. Note that the same test ID should not appear -### in both 'tests' and 'skips', this would be nonsensical and is detected by -### Bandit at runtime. - -# Available tests: -# B101 : assert_used -# B102 : exec_used -# B103 : set_bad_file_permissions -# B104 : hardcoded_bind_all_interfaces -# B105 : hardcoded_password_string -# B106 : hardcoded_password_funcarg -# B107 : hardcoded_password_default -# B108 : hardcoded_tmp_directory -# B110 : try_except_pass -# B112 : try_except_continue -# B201 : flask_debug_true -# B301 : pickle -# B302 : marshal -# B303 : md5 -# B304 : ciphers -# B305 : cipher_modes -# B306 : mktemp_q -# B307 : eval -# B308 : mark_safe -# B309 : httpsconnection -# B310 : urllib_urlopen -# B311 : random -# B312 : telnetlib -# B313 : xml_bad_cElementTree -# B314 : xml_bad_ElementTree -# B315 : xml_bad_expatreader -# B316 : xml_bad_expatbuilder -# B317 : xml_bad_sax -# B318 : xml_bad_minidom -# B319 : xml_bad_pulldom -# B320 : xml_bad_etree -# B321 : ftplib -# B322 : input -# B323 : unverified_context -# B324 : hashlib_new_insecure_functions -# B325 : tempnam -# B401 : import_telnetlib -# B402 : import_ftplib -# B403 : import_pickle -# B404 : import_subprocess -# B405 : import_xml_etree -# B406 : import_xml_sax -# B407 : import_xml_expat -# B408 : import_xml_minidom -# B409 : import_xml_pulldom -# B410 : import_lxml -# B411 : import_xmlrpclib -# B412 : import_httpoxy -# B413 : import_pycrypto -# B501 : request_with_no_cert_validation -# B502 : ssl_with_bad_version -# B503 : ssl_with_bad_defaults -# B504 : ssl_with_no_version -# B505 : weak_cryptographic_key -# B506 : yaml_load -# B507 : ssh_no_host_key_verification -# B601 : paramiko_calls -# B602 : subprocess_popen_with_shell_equals_true -# B603 : subprocess_without_shell_equals_true -# B604 : any_other_function_with_shell_equals_true -# B605 : start_process_with_a_shell -# B606 : 
start_process_with_no_shell -# B607 : start_process_with_partial_path -# B608 : hardcoded_sql_expressions -# B609 : linux_commands_wildcard_injection -# B610 : django_extra_used -# B611 : django_rawsql_used -# B701 : jinja2_autoescape_false -# B702 : use_of_mako_templates -# B703 : django_mark_safe - -exclude_dirs: - - 'venv' - - 'env' - - 'build' - - 'dist' - - 'migrations' - -# (optional) list included test IDs here, eg '[B101, B406]': -tests: - -# (optional) list skipped test IDs here, eg '[B101, B406]': -skips: - -### (optional) plugin settings - some test plugins require configuration data -### that may be given here, per-plugin. All bandit test plugins have a built in -### set of sensible defaults and these will be used if no configuration is -### provided. It is not necessary to provide settings for every (or any) plugin -### if the defaults are acceptable. -assert_used: - skips: ['*test_*.py'] - -any_other_function_with_shell_equals_true: - no_shell: - - os.execl - - os.execle - - os.execlp - - os.execlpe - - os.execv - - os.execve - - os.execvp - - os.execvpe - - os.spawnl - - os.spawnle - - os.spawnlp - - os.spawnlpe - - os.spawnv - - os.spawnve - - os.spawnvp - - os.spawnvpe - - os.startfile - shell: - - os.system - - os.popen - - os.popen2 - - os.popen3 - - os.popen4 - - popen2.popen2 - - popen2.popen3 - - popen2.popen4 - - popen2.Popen3 - - popen2.Popen4 - - commands.getoutput - - commands.getstatusoutput - subprocess: - - subprocess.Popen - - subprocess.call - - subprocess.check_call - - subprocess.check_output - - subprocess.run -hardcoded_tmp_directory: - tmp_dirs: - - /tmp - - /var/tmp - - /dev/shm -linux_commands_wildcard_injection: - no_shell: - - os.execl - - os.execle - - os.execlp - - os.execlpe - - os.execv - - os.execve - - os.execvp - - os.execvpe - - os.spawnl - - os.spawnle - - os.spawnlp - - os.spawnlpe - - os.spawnv - - os.spawnve - - os.spawnvp - - os.spawnvpe - - os.startfile - shell: - - os.system - - os.popen - - os.popen2 - - 
os.popen3 - - os.popen4 - - popen2.popen2 - - popen2.popen3 - - popen2.popen4 - - popen2.Popen3 - - popen2.Popen4 - - commands.getoutput - - commands.getstatusoutput - subprocess: - - subprocess.Popen - - subprocess.call - - subprocess.check_call - - subprocess.check_output - - subprocess.run -ssl_with_bad_defaults: - bad_protocol_versions: - - PROTOCOL_SSLv2 - - SSLv2_METHOD - - SSLv23_METHOD - - PROTOCOL_SSLv3 - - PROTOCOL_TLSv1 - - SSLv3_METHOD - - TLSv1_METHOD -ssl_with_bad_version: - bad_protocol_versions: - - PROTOCOL_SSLv2 - - SSLv2_METHOD - - SSLv23_METHOD - - PROTOCOL_SSLv3 - - PROTOCOL_TLSv1 - - SSLv3_METHOD - - TLSv1_METHOD -start_process_with_a_shell: - no_shell: - - os.execl - - os.execle - - os.execlp - - os.execlpe - - os.execv - - os.execve - - os.execvp - - os.execvpe - - os.spawnl - - os.spawnle - - os.spawnlp - - os.spawnlpe - - os.spawnv - - os.spawnve - - os.spawnvp - - os.spawnvpe - - os.startfile - shell: - - os.system - - os.popen - - os.popen2 - - os.popen3 - - os.popen4 - - popen2.popen2 - - popen2.popen3 - - popen2.popen4 - - popen2.Popen3 - - popen2.Popen4 - - commands.getoutput - - commands.getstatusoutput - subprocess: - - subprocess.Popen - - subprocess.call - - subprocess.check_call - - subprocess.check_output - - subprocess.run -start_process_with_no_shell: - no_shell: - - os.execl - - os.execle - - os.execlp - - os.execlpe - - os.execv - - os.execve - - os.execvp - - os.execvpe - - os.spawnl - - os.spawnle - - os.spawnlp - - os.spawnlpe - - os.spawnv - - os.spawnve - - os.spawnvp - - os.spawnvpe - - os.startfile - shell: - - os.system - - os.popen - - os.popen2 - - os.popen3 - - os.popen4 - - popen2.popen2 - - popen2.popen3 - - popen2.popen4 - - popen2.Popen3 - - popen2.Popen4 - - commands.getoutput - - commands.getstatusoutput - subprocess: - - subprocess.Popen - - subprocess.call - - subprocess.check_call - - subprocess.check_output - - subprocess.run -start_process_with_partial_path: - no_shell: - - os.execl - - os.execle - - 
os.execlp - - os.execlpe - - os.execv - - os.execve - - os.execvp - - os.execvpe - - os.spawnl - - os.spawnle - - os.spawnlp - - os.spawnlpe - - os.spawnv - - os.spawnve - - os.spawnvp - - os.spawnvpe - - os.startfile - shell: - - os.system - - os.popen - - os.popen2 - - os.popen3 - - os.popen4 - - popen2.popen2 - - popen2.popen3 - - popen2.popen4 - - popen2.Popen3 - - popen2.Popen4 - - commands.getoutput - - commands.getstatusoutput - subprocess: - - subprocess.Popen - - subprocess.call - - subprocess.check_call - - subprocess.check_output - - subprocess.run -subprocess_popen_with_shell_equals_true: - no_shell: - - os.execl - - os.execle - - os.execlp - - os.execlpe - - os.execv - - os.execve - - os.execvp - - os.execvpe - - os.spawnl - - os.spawnle - - os.spawnlp - - os.spawnlpe - - os.spawnv - - os.spawnve - - os.spawnvp - - os.spawnvpe - - os.startfile - shell: - - os.system - - os.popen - - os.popen2 - - os.popen3 - - os.popen4 - - popen2.popen2 - - popen2.popen3 - - popen2.popen4 - - popen2.Popen3 - - popen2.Popen4 - - commands.getoutput - - commands.getstatusoutput - subprocess: - - subprocess.Popen - - subprocess.call - - subprocess.check_call - - subprocess.check_output - - subprocess.run -subprocess_without_shell_equals_true: - no_shell: - - os.execl - - os.execle - - os.execlp - - os.execlpe - - os.execv - - os.execve - - os.execvp - - os.execvpe - - os.spawnl - - os.spawnle - - os.spawnlp - - os.spawnlpe - - os.spawnv - - os.spawnve - - os.spawnvp - - os.spawnvpe - - os.startfile - shell: - - os.system - - os.popen - - os.popen2 - - os.popen3 - - os.popen4 - - popen2.popen2 - - popen2.popen3 - - popen2.popen4 - - popen2.Popen3 - - popen2.Popen4 - - commands.getoutput - - commands.getstatusoutput - subprocess: - - subprocess.Popen - - subprocess.call - - subprocess.check_call - - subprocess.check_output - - subprocess.run -try_except_continue: - check_typed_exception: false -try_except_pass: - check_typed_exception: false -weak_cryptographic_key: - 
weak_key_size_dsa_high: 1024 - weak_key_size_dsa_medium: 2048 - weak_key_size_ec_high: 160 - weak_key_size_ec_medium: 224 - weak_key_size_rsa_high: 1024 - weak_key_size_rsa_medium: 2048 diff --git a/{{cookiecutter.repo_name}}/data/README.md b/{{cookiecutter.repo_name}}/data/README.md index 63ba392..af0635c 100644 --- a/{{cookiecutter.repo_name}}/data/README.md +++ b/{{cookiecutter.repo_name}}/data/README.md @@ -4,16 +4,16 @@ layered data-engineering convention ![layeded data engineering](https://docs.kedro.org/en/0.18.6/_images/data_engineering_convention.png) -| ****Folder in data**** | ****Description**** | +| `Folder in data` | `Description` | | ---------------------- | --- | -| ****raw**** | initial start of the pipeline, containing the sourced data model(s) that should never be changed, it forms your single source of truth to work from. these data models are typically un-typed in most cases e.g. csv, but this will vary from case to case | -| ****intermediate**** | optional data model(s), which are introduced to type your raw data model(s), e.g. 
converting string based values into their current typed representation | -| ****primary**** | domain specific data model(s) containing cleansed, transformed and wrangled data from either raw or intermediate, which forms your layer that you input into your feature engineering | -| ****feature**** | analytics specific data model(s) containing a set of features defined against the primary data, which are grouped by feature area of analysis and stored against a common dimension | -| ****model input**** | analytics specific data model(s) containing all feature data against a common dimension and in the case of live projects against an analytics run date to ensure that you track the historical changes of the features over time | -| ****models**** | stored, serialised pre-trained machine learning models | -| ****model output**** | analytics specific data model(s) containing the results generated by the model based on the model input data | -| ****reporting**** | reporting data model(s) that are used to combine a set of primary, feature, model input and model output data used to drive the dashboard and the views constructed. it encapsulates and removes the need to define any blending or joining of data, improve performance and replacement of presentation layer without having to redefine the data models | +| `raw` | initial start of the pipeline, containing the sourced data model(s) that should never be changed, it forms your single source of truth to work from. these data models are typically un-typed in most cases e.g. csv, but this will vary from case to case | +| `intermediate` | optional data model(s), which are introduced to type your raw data model(s), e.g. 
converting string-based values into their current typed representation | +| `primary` | domain-specific data model(s) containing cleansed, transformed and wrangled data from either raw or intermediate, which forms your layer that you input into your feature engineering | +| `feature` | analytics-specific data model(s) containing a set of features defined against the primary data, which are grouped by feature area of analysis and stored against a common dimension | +| `model input` | analytics-specific data model(s) containing all feature data against a common dimension and in the case of live projects against an analytics run date to ensure that you track the historical changes of the features over time | +| `models` | stored, serialised pre-trained machine learning models | +| `model output` | analytics-specific data model(s) containing the results generated by the model based on the model input data | +| `reporting` | reporting data model(s) that are used to combine a set of primary, feature, model input and model output data used to drive the dashboard and the views constructed.
it encapsulates and removes the need to define any blending or joining of data, improve performance and replacement of presentation layer without having to redefine the data models | ## References diff --git a/{{cookiecutter.repo_name}}/src/libs/__init__.py b/{{cookiecutter.repo_name}}/src/libs/__init__.py new file mode 100644 index 0000000..eb41cf4 --- /dev/null +++ b/{{cookiecutter.repo_name}}/src/libs/__init__.py @@ -0,0 +1 @@ +"""Source code of your project""" diff --git a/{{cookiecutter.repo_name}}/src/libs/data_etl/__init__.py b/{{cookiecutter.repo_name}}/src/libs/data_etl/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/{{cookiecutter.repo_name}}/src/libs/data_etl/__init__.py @@ -0,0 +1 @@ + diff --git a/{{cookiecutter.repo_name}}/src/libs/data_validation/__init__.py b/{{cookiecutter.repo_name}}/src/libs/data_validation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_cleaning/__init__.py b/{{cookiecutter.repo_name}}/src/libs/feat_cleaning/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/{{cookiecutter.repo_name}}/src/libs/feat_cleaning/__init__.py @@ -0,0 +1 @@ + diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_encoding/__init__.py b/{{cookiecutter.repo_name}}/src/libs/feat_encoding/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/{{cookiecutter.repo_name}}/src/libs/feat_encoding/__init__.py @@ -0,0 +1 @@ + diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_imputation/__init__.py b/{{cookiecutter.repo_name}}/src/libs/feat_imputation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_new_features/__init__.py b/{{cookiecutter.repo_name}}/src/libs/feat_new_features/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_pipelines/__init__.py 
b/{{cookiecutter.repo_name}}/src/libs/feat_pipelines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_preprocess_strings/__init__.py b/{{cookiecutter.repo_name}}/src/libs/feat_preprocess_strings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_scaling/__init__.py b/{{cookiecutter.repo_name}}/src/libs/feat_scaling/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_selection/__init__.py b/{{cookiecutter.repo_name}}/src/libs/feat_selection/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/feat_strings/__init__.py b/{{cookiecutter.repo_name}}/src/libs/feat_strings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/metrics/__init__.py b/{{cookiecutter.repo_name}}/src/libs/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/model/__init__.py b/{{cookiecutter.repo_name}}/src/libs/model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/model_evaluation/__init__.py b/{{cookiecutter.repo_name}}/src/libs/model_evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/model_selection/__init__.py b/{{cookiecutter.repo_name}}/src/libs/model_selection/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/model_validation/__init__.py b/{{cookiecutter.repo_name}}/src/libs/model_validation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/libs/reports/__init__.py b/{{cookiecutter.repo_name}}/src/libs/reports/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/pipelines/__init__.py 
b/{{cookiecutter.repo_name}}/src/pipelines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/pipelines/data_etl/__init__.py b/{{cookiecutter.repo_name}}/src/pipelines/data_etl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/pipelines/feature_engineering/__init__.py b/{{cookiecutter.repo_name}}/src/pipelines/feature_engineering/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/pipelines/model_evaluation/__init__.py b/{{cookiecutter.repo_name}}/src/pipelines/model_evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/pipelines/model_prediction/__init__.py b/{{cookiecutter.repo_name}}/src/pipelines/model_prediction/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/{{cookiecutter.repo_name}}/src/pipelines/model_train/__init__.py b/{{cookiecutter.repo_name}}/src/pipelines/model_train/__init__.py new file mode 100644 index 0000000..e69de29