Init

mfisher87 · Nov 5, 2023 · f165f1d · f165f1d
commit f165f1d
Show file tree

Hide file tree

Showing 16 changed files with 397 additions and 0 deletions.
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -0,0 +1,98 @@
+# Build the Quarto site, then deploy it to either GitHub Pages (production) or
+# Netlify (PR previews), depending on the event that triggered the run.
+name: "Build and deploy"
+
+on:
+  # Pushes to the main branch produce "production" deployments
+  push:
+    branches:
+      - "main"
+
+  # Pull requests produce preview deployments
+  pull_request:
+
+  # Manual runs from the Actions tab. Both deploy jobs are gated on the event
+  # name (push / pull_request), so a manual run only executes the build job.
+  workflow_dispatch:
+
+
+# Sets permissions of the GITHUB_TOKEN. Declared at the top level of the
+# workflow, so the same grants apply to every job below.
+permissions:
+  # For GitHub Pages:
+  contents: "read"
+  pages: "write"
+  id-token: "write"
+  # For PR preview comments:
+  pull-requests: "write"
+
+
+# Allow one concurrent run per Git ref. Keying the group on the ref keeps a PR
+# preview run from cancelling an in-progress production deployment, and keeps
+# unrelated PRs from cancelling each other. A newer run for the same ref still
+# supersedes the older one.
+concurrency:
+  group: "pages-${{ github.ref }}"
+  cancel-in-progress: true
+
+
+jobs:
+  # Build job: render the Quarto site and upload the result as an artifact for
+  # the deploy jobs that declare `needs: "build"`
+  build:
+    runs-on: "ubuntu-latest"
+    steps:
+      - name: "Checkout"
+        uses: "actions/checkout@v3"
+
+      # Sets up Quarto so the `quarto` command used in the next step is available
+      - uses: "quarto-dev/quarto-actions/setup@v2"
+
+      - name: "Render Quarto website"
+        run: "quarto render ."
+
+      # The preview job downloads this artifact by its name, "github-pages"
+      - name: "Upload site artifact"
+        uses: "actions/upload-pages-artifact@v1"
+        with:
+          path: "./_site"
+
+
+  # Deploy preview to Netlify IFF this action triggered by a PR from this repository
+  # Based on: https://github.com/quarto-dev/quarto-web/blob/main/.github/workflows/preview.yml
+  deploy_preview:
+    # PRs from forks are skipped: `pull_request` runs for forks receive no repository
+    # secrets and a read-only GITHUB_TOKEN, so the Netlify deploy and the PR comment
+    # could only fail there.
+    if: "github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository"
+    runs-on: "ubuntu-latest"
+    needs: "build"
+    steps:
+      - name: "Download site artifact"
+        uses: "actions/download-artifact@v3"
+        with:
+          # The name of artifacts created by `actions/upload-pages-artifact` is always "github-pages"
+          name: "github-pages"
+          path: "./_site"
+
+      # The downloaded artifact is a tarball; unpack it in place so ./_site holds the site files
+      - name: "Untar site artifact"
+        run: "tar --directory ./_site -xvf ./_site/artifact.tar"
+
+      - name: "Deploy preview to Netlify"
+        uses: "nwtgck/actions-netlify@v2"
+        env:
+          NETLIFY_SITE_ID: "${{ secrets.NETLIFY_SITE_ID }}"
+          NETLIFY_AUTH_TOKEN: "${{ secrets.NETLIFY_AUTH_TOKEN }}"
+        with:
+          publish-dir: "./_site"
+          production-deploy: false
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          deploy-message: "Deploy from ${{ github.event.repository.full_name }} GHA: PR ${{ github.event.pull_request.number }} - ${{ github.event.pull_request.title }}"
+          # One stable alias per PR, so repeated pushes to a PR reuse the same preview name
+          alias: "${{ github.event.repository.name }}-pr-${{ github.event.pull_request.number }}-preview"
+          # these all default to 'true'
+          enable-pull-request-comment: true
+          enable-commit-comment: false
+          enable-commit-status: true
+          overwrites-pull-request-comment: false
+        # Step-level timeout: fail fast instead of hanging if the Netlify deploy stalls
+        timeout-minutes: 1
+
+
+  # Deploy to GH Pages IFF this action triggered by push (the workflow's only
+  # push trigger is the main branch)
+  deploy:
+    if: "github.event_name == 'push'"
+    runs-on: "ubuntu-latest"
+    needs: "build"
+    environment:
+      name: "github-pages"
+      # Taken from the output of the step with id "deployment" below
+      url: "${{ steps.deployment.outputs.page_url }}"
+    steps:
+      - name: "Deploy to GitHub Pages"
+        id: "deployment"
+        uses: "actions/deploy-pages@v1"
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+# Quarto's project-local working directory (presumably cache/scratch state)
+/.quarto/
+# Rendered site output (the directory the deploy workflow uploads)
+/_site/
diff --git a/README.md b/README.md
@@ -0,0 +1,18 @@
+# Infra
+
+Describes (and eventually builds) my home service infrastructure.
+
+> [!NOTE]
+>
+> This is a _work in progress_. TODO:
+>
+> * Much of my service-specific config is in private repositories and needs
+>   to be migrated. I'm not sure yet how I'll organize that config going forward; perhaps
+>   it will all be in this repo? Maybe sub-repos?
+>
+> * I'm optimizing for speed in writing the first pass. I need to fill in details like
+>   model numbers, complete specs, etc. later.
+>
+> * What information should be published and what should be withheld?
+>
+> * Create doc structure so storage pools can each have their own page.
diff --git a/_quarto.yml b/_quarto.yml
@@ -0,0 +1,41 @@
+# Render this project as a Quarto website
+project:
+  type: "website"
+
+
+# Site-wide settings: identity, repository links, footer, and sidebar navigation
+website:
+  title: "Matt Fisher Infrastructure"
+  site-url: "https://mfisher87.github.io/infra"
+  # site-path: "/infra"  # Needed?
+
+  # Source repository, plus the repository actions to enable for it
+  repo-url: "https://github.com/mfisher87/infra"
+  repo-actions:
+    - "edit"
+    - "source"
+    - "issue"
+
+  page-footer:
+    right: "This page is built with [Quarto](https://quarto.org/)."
+    left: "&copy; 2023"
+
+  sidebar:
+    background: "#eee"
+    style: "docked"
+    search: true
+    contents:
+      # "Nodes" and "Services" list their pages via a glob over the directory
+      - text: "Nodes"
+        href: "nodes/index.md"
+        contents: "nodes/*.md"
+      - text: "Services"
+        href: "services/index.md"
+        contents: "services/*.md"
+      # "Incidents" and "Changes" link only to their index pages, which
+      # declare a `listing` in their front matter
+      - text: "Incidents"
+        href: "incidents/index.md"
+      - text: "Changes"
+        href: "changes/index.md"
+
+
+# HTML output options applied to every page
+format:
+  html:
+    theme: "cosmo"
+    # Custom stylesheet (styles.css is not visible in this excerpt)
+    css: "styles.css"
+    toc: true
diff --git a/changes/2023-11-incident-response/index.md b/changes/2023-11-incident-response/index.md
@@ -0,0 +1,34 @@
+---
+title: "2023-11: Disk failure incident response"
+description: |
+  New storage was added and a new pool was configured in response to this incident.
+---
+
+## Storage changes
+
+As a result of this incident, the failed 1TB HDD was replaced with a 1TB HDD taken from
+storage pool 1. A new 4TB HDD was purchased to replace that drive in pool 1, increasing
+the pool's capacity from 5TB (4TB + 1TB data + 4TB parity) to 8TB (4TB + 4TB data + 4TB
+parity).
+
+In addition, storage pool 2 was created from in-stock 8TB disks to provide migration
+space during recovery.
+
+
+## Deployment changes
+
+The VM which was mistakenly assigned to house its disk on this inappropriate drive will
+need to be recreated from scratch to replace all the services that were running on it.
+
+
+## Other changes
+
+### Labels!
+
+Finding the physical failed drive was far more of a challenge than I wanted it to be. I
+printed labels for each drive bay on the [storage node](/nodes/storage.md) including:
+
+* Internal SATA port the bay is connected to
+* The drive model currently in the bay
+* The capacity of the drive currently in the bay
+* The functional purpose of the drive currently in the bay (e.g. `Pool1 Parity1`, `Pool2
+  Data1`, etc.)
diff --git a/changes/index.md b/changes/index.md
@@ -0,0 +1,4 @@
+---
+title: "Changes"
+listing: default
+---
diff --git a/incidents/2023-10-28-disk-failure/index.md b/incidents/2023-10-28-disk-failure/index.md
@@ -0,0 +1,19 @@
+---
+title: "2023-10-28: Disk failure"
+description: |
+  A 1TB HDD in the [storage node](/nodes/storage.md) failed. This drive was used for VM
+  backups, and (mistakenly) for a services VM disk. This resulted in minor data loss.
+---
+
+This disk was unfortunately unprotected, because I figured it was just VM backups and if
+it died, I could replace it and continue. This was perhaps a risky choice, but I also
+made a big mistake by accidentally storing a live VM disk on this drive.
+
+The data lost was unpushed service configuration changes, and potentially some
+relatively unimportant secrets.
+
+
+## Infrastructure changes
+
+See the related [change document](/changes/2023-11-incident-response/index.md) for more
+details about changes as a result of this incident.
diff --git a/incidents/index.md b/incidents/index.md
@@ -0,0 +1,4 @@
+---
+title: "Incidents"
+listing: default
+---
diff --git a/index.md b/index.md
@@ -0,0 +1,95 @@
+---
+title: "My infrastructure"
+---
+
+## Current state
+
+I have two physical nodes:
+
+1. [Storage](nodes/storage.md): An older build from new parts in a Supermicro chassis with 12
+   hotswap drive bays.
+2. [Compute](nodes/compute.md): A newer build from used eBay parts in a Supermicro
+   chassis with lots of airflow.
+
+
+### Virtualization/containerization
+
+Each of these nodes is running _Proxmox VE_. Software is deployed on VMs, enabling
+deployment of virtual clustered systems.
+
+Software is deployed on containers to the extent possible. Sometimes Docker Swarm,
+sometimes Docker Compose, sometimes Kubernetes.
+
+:::{.callout-note}
+That may not even be a full listing of my deployment types...
+
+TODO: Standardize my deployments!
+:::
+
+
+### Storage
+
+My aim is to use simple solutions that minimize [cognitive
+load](https://mfisher87.github.io/cognitive_load.html) and maximize flexibility.
+
+:::{.callout-note}
+I made a lot of these decisions a long time ago and don't have my full rationale
+anymore. When I find or remember it, I'll update this page.
+:::
+
+
+#### Redundancy: _mdadm_
+
+[_mdadm_](https://en.wikipedia.org/wiki/Mdadm) is a utility for managing software RAID.
+I'm using this for operating system drives (mostly SSDs?) to enable servers to survive
+drive failures.
+
+
+#### Redundancy: _SnapRAID_
+
+[_SnapRAID_](https://www.snapraid.it/) is a non-realtime software RAID solution. I'm
+using this for shared storage drives. Parity is calculated and validated (bit-rot
+protection) on a schedule.
+
+I'm also using _UnionFS_ to expose data drives as a unified pool.
+
+##### Rationale
+
+* Data drives can be accessed in isolation from the array (even if the array can't be
+  fully recovered).
+* Data files are hashed to protect from bit rot.
+* Arrays are flexible to change size, increase parity, etc.
+    * Disks with data already on them can be added to the array.
+* Only the disk being accessed will spin up.
+* Between the scheduled parity recalculations, it's possible to "un-delete" files!
+
+
+##### Considerations
+
+* Parity drives must be among the largest in the pool. For now I have two pools until I
+  get more drives and can re-organize them into one pool.
+
+
+### Network
+
+1GbE
+
+
+## Desired state
+
+### Backups as a service
+
+How to make automated (file-level) backups easier? Time to try out some new tools, e.g.
+Borg, Restic?
+
+
+### Energy efficiency
+
+* How to get nodes to sleep when not in use?
+
+* What services can run on more purpose-built hardware to save energy from e.g. software
+  video encoding?
+
+* How can I integrate SBCs (Raspberry Pi, oDroid N2+, H3+) to save energy?
+
+* How best to monitor energy usage at the node/service level?
diff --git a/nodes/compute.md b/nodes/compute.md
@@ -0,0 +1,13 @@
+---
+title: "Compute node"
+---
+
+The newest node in my infrastructure, this node provides various infrastructure and
+end-user services.
+
+Built in 2020 (?) from used parts bought on eBay and parts I had in stock.
+
+
+## Storage
+
+OS and VM Disk storage is on dual SSDs in an _mdadm_ mirrored array.
diff --git a/nodes/index.md b/nodes/index.md
@@ -0,0 +1,5 @@
+---
+title: "Nodes"
+listing:
+  type: table
+---
diff --git a/nodes/storage.md b/nodes/storage.md
@@ -0,0 +1,44 @@
+---
+title: "Storage node"
+---
+
+The first node I set up long ago to act as a NAS, this node provides storage services,
+but also runs some end-user services.
+
+Originally built maybe a decade ago from new parts and used parts I had in stock.
+Upgraded over time and eventually migrated into a Supermicro 12-bay hot-swap chassis.
+
+
+## Storage
+
+A total of 14 ports (6 motherboard SATA + 8 SAS expander PCI card). 12 are exposed as
+hot-swap drive bays.
+
+Hot-swap drives are largely exposed as _UnionFS_ drive pools, with parity provided by
+SnapRAID.
+
+
+### Pool 1
+
+3x 4TB HDDs. 2 data disks totaling 8TB of storage. 1 parity disk (4TB).
+
+
+### Pool 2
+
+3x 8TB HDDs. 2 data disks totaling 16TB of storage. 1 parity disk (8TB).
+
+
+### Considerations
+
+When I add more disks, consider combining the pools. The current setup is a compromise
+between maximizing available storage and tolerance for drive failures.
+
+Pros:
+
+* Can tolerate 1 drive failure, 2 if I'm lucky and they happen on separate pools.
+* 24TB of storage available.
+
+Cons:
+
+* If two drives in the same pool fail, I'm in trouble. Combining the pools and having two
+  parity drives would allow two arbitrary drives to fail.
diff --git a/services/end-user.md b/services/end-user.md
@@ -0,0 +1,6 @@
+---
+title: "End-user services"
+---
+
+Services which are accessed directly by end-users, e.g. _HomeAssistant_, wiki,
+dashboards, etc.