From a13581b22df82ec0dad5876f4152e0a0e4a66010 Mon Sep 17 00:00:00 2001 From: "Jose R. Zapata" Date: Wed, 27 Mar 2024 20:48:20 -0500 Subject: [PATCH] =?UTF-8?q?=F0=9F=8D=AA=F0=9F=93=9D=20update=20documentati?= =?UTF-8?q?on=20with=20directory=20and=20data=20structure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/data_schema.md | 1 + docs/directory_hierarchy.md | 1 + docs/index.md | 3 ++- mkdocs.yml | 8 ++++++-- {{cookiecutter.repo_name}}/data/README.md | 18 +++++++++--------- 5 files changed, 19 insertions(+), 12 deletions(-) create mode 100644 docs/data_schema.md create mode 100644 docs/directory_hierarchy.md diff --git a/docs/data_schema.md b/docs/data_schema.md new file mode 100644 index 0000000..b6209db --- /dev/null +++ b/docs/data_schema.md @@ -0,0 +1 @@ +--8<-- "{{cookiecutter.repo_name}}/data/README.md" diff --git a/docs/directory_hierarchy.md b/docs/directory_hierarchy.md new file mode 100644 index 0000000..1110ce7 --- /dev/null +++ b/docs/directory_hierarchy.md @@ -0,0 +1 @@ +--8<-- "README.md:104:182" diff --git a/docs/index.md b/docs/index.md index 612c7a5..7f5d635 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1 +1,2 @@ ---8<-- "README.md" +--8<-- "README.md::20" +--8<-- "README.md:52" diff --git a/mkdocs.yml b/mkdocs.yml index 5635c5b..1959008 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -19,12 +19,15 @@ theme: nav: - 📖 Introduction: 'index.md' + - 🗃️ Project structure: + - 🗂️ Directory Hierarchy: 'directory_hierarchy.md' + - 🗄️ Data Schema: 'data_schema.md' - 🔑 Setup Tokens: 'setup_tokens.md' - - ⚙️ Pre-commit configuration: 'pre-commit.md' + - ⚙️ Pre-commit configuration: 'pre-commit.md' - 🚀 Github Actions: - Pre-commit_autoupdate: 'github_actions/gh_action_pre-commit-autoupdate.md' - 💻 VSCode configuration: 'vscode.md' - - 🏠 Local Dev setup: 'local_setup.md' + - 🛠️ Local Dev setup: 'local_setup.md' markdown_extensions: - pymdownx.snippets: @@ -39,3 +42,4 @@ markdown_extensions: pygments_lang_class: true - pymdownx.inlinehilite - pymdownx.superfences + - tables diff --git a/{{cookiecutter.repo_name}}/data/README.md b/{{cookiecutter.repo_name}}/data/README.md index 63ba392..af0635c 100644 --- a/{{cookiecutter.repo_name}}/data/README.md +++ b/{{cookiecutter.repo_name}}/data/README.md @@ -4,16 +4,16 @@ layered data-engineering convention ![layeded data engineering](https://docs.kedro.org/en/0.18.6/_images/data_engineering_convention.png) -| ****Folder in data**** | ****Description**** | +| `Folder in data` | `Description` | | ---------------------- | --- | -| ****raw**** | initial start of the pipeline, containing the sourced data model(s) that should never be changed, it forms your single source of truth to work from. these data models are typically un-typed in most cases e.g. csv, but this will vary from case to case | -| ****intermediate**** | optional data model(s), which are introduced to type your raw data model(s), e.g. converting string based values into their current typed representation | -| ****primary**** | domain specific data model(s) containing cleansed, transformed and wrangled data from either raw or intermediate, which forms your layer that you input into your feature engineering | -| ****feature**** | analytics specific data model(s) containing a set of features defined against the primary data, which are grouped by feature area of analysis and stored against a common dimension | -| ****model input**** | analytics specific data model(s) containing all feature data against a common dimension and in the case of live projects against an analytics run date to ensure that you track the historical changes of the features over time | -| ****models**** | stored, serialised pre-trained machine learning models | -| ****model output**** | analytics specific data model(s) containing the results generated by the model based on the model input data | -| ****reporting**** | reporting data model(s) that are used to combine a set of primary, feature, model input and model output data used to drive the dashboard and the views constructed. it encapsulates and removes the need to define any blending or joining of data, improve performance and replacement of presentation layer without having to redefine the data models | +| `raw` | initial start of the pipeline, containing the sourced data model(s) that should never be changed, it forms your single source of truth to work from. these data models are typically un-typed in most cases e.g. csv, but this will vary from case to case | +| `intermediate` | optional data model(s), which are introduced to type your raw data model(s), e.g. converting string based values into their current typed representation | +| `primary` | domain specific data model(s) containing cleansed, transformed and wrangled data from either raw or intermediate, which forms your layer that you input into your feature engineering | +| `feature` | analytics specific data model(s) containing a set of features defined against the primary data, which are grouped by feature area of analysis and stored against a common dimension | +| `model input` | analytics specific data model(s) containing all feature data against a common dimension and in the case of live projects against an analytics run date to ensure that you track the historical changes of the features over time | +| `models` | stored, serialised pre-trained machine learning models | +| `model output` | analytics specific data model(s) containing the results generated by the model based on the model input data | +| `reporting` | reporting data model(s) that are used to combine a set of primary, feature, model input and model output data used to drive the dashboard and the views constructed. it encapsulates and removes the need to define any blending or joining of data, improve performance and replacement of presentation layer without having to redefine the data models | ## References