diff --git a/.github/workflows/add_issues_to_project.yml b/.github/workflows/add_issues_to_project.yml new file mode 100644 index 00000000..b1f07aac --- /dev/null +++ b/.github/workflows/add_issues_to_project.yml @@ -0,0 +1,14 @@ +on: + issues: + types: + - opened + +jobs: + add-to-project: + name: Add issue to project + runs-on: ubuntu-latest + steps: + - uses: actions/add-to-project@RELEASE_VERSION + with: + project-url: https://github.com/orgs/ooni/projects/31 + github-token: ${{ secrets.ADD_TO_PROJECT_GH_TOKEN }} diff --git a/README.md b/README.md index 5d2398e8..fa9b8c7a 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,26 @@ # OONI Devops -## Infrastructure Tiers - -We divide our infrastructure components into 3 tiers: - -- **Tier 0: Critical**: These are mission critical infrastructure components. If these become unavailable or have significant disruption, it will have a major impact. - -- **Tier 1: Essential**: These components are important, but not as critical as - tier 0. They are part of our core operations, but if they become unavailable - the impact is important, but not major. - -- **Tier 2: Non-Essential**: These are auxiliary components. Their - unavailability does not have a major impact. - -### Tier 0 (Critical) components - -- [ ] Probe Services (collector specifically) -- [ ] Fastpath (part responsible for storing post-cans) -- [x] DNS configuration -- [ ] Monitoring -- [ ] OONI bridges -- [ ] OONI.org website -- [x] Web Connectivity test helpers -- [ ] Code signing - -### Tier 1 (Essential) components - -- [ ] OONI API measurement listing -- [x] OONI Explorer -- [x] OONI Run -- [ ] OONI Data analysis pipeline -- [ ] OONI Findings API -- [x] Website analytics - -### Tier 2 (Non-Essential) components - -- [ ] Test list editor -- [ ] Jupyter notebooks -- [ ] Countly +The diagram below gives an at-a-glance view of the overall OONI infrastructure architecture across our various hosting locations: + +```mermaid +flowchart TB + apiorg([api.ooni.org])-->alb + apiio([api.ooni.io])-->backend + ecs[Backend API ECS]<-->ch[(Clickhouse Cluster)] + subgraph Hetzner + backend[OONI Backend Monolith]<-->ch + monitoring[Monitoring host] + pipeline[Pipeline v5] + end + subgraph AWS + alb[API Load Balancer]<-->ecs + alb-->backend + ecs<-->s3[(OONI S3 Buckets)] + s3<-->backend + end + subgraph Digital Ocean + th[Web Connectivity Test helper]<-->alb + end +``` + +For more details, see the [Infrastructure docs](https://docs.ooni.org/devops/infrastructure/). diff --git a/ansible/README.md b/ansible/README.md index 60da1de5..8b220965 100644 --- a/ansible/README.md +++ b/ansible/README.md @@ -1,4 +1,14 @@ -### Quickstart +# Ansible + +**NOTE**: We are currently migrating ansible configurations from [ooni/sysadmin](https://github.com/ooni/sysadmin) to [ooni/devops](https://github.com/ooni/devops). + +Ansible is used to configure the OSes of long-term provisioned backend hosts and to manage the configuration of the components running on them. + +For example, ansible is used to set up VPSs and dedicated hosts that are provisioned manually or via terraform. + +In the case of hosts that are continuously delivered, we instead use cloud-native configuration management tools.
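+ +A quick way to inspect which hosts and groups this ansible setup manages is to graph the inventory file (a sketch, assuming you have already completed the setup below and are inside the `ansible/` directory; `ansible-inventory` ships with ansible itself): + +``` +ansible-inventory -i inventory --graph +```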
+ +## Installation and setup It's recommended to make use of a virtualenv, for example managed using `pyenv virtualenv`: ``` @@ -6,41 +16,173 @@ pyenv virtualenv ooni-devops pyenv activate ooni-devops ``` -Install deps: +### Ansible setup + +You should then install the required python and ansible-galaxy depedencies with: ``` -pip install ansible dnspython boto3 passlib +pip install -r requirements/python.yml +ansible-galaxy install -r requirements/ansible-galaxy.yml ``` -Install ansible galaxy modules: +In order to gain access to machines you will have to add your public key to the +`ssh_users` variable inside of `ansible/group_vars/all/vars.yml`. + +It's recommended you generate an `ed25519` key using the following command: ``` -ansible-galaxy install -r requirements.yml +ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519_ooni ``` -Setup AWS credentials, you should add 2 profiles called `oonidevops_user_dev` and `oonidevops_user_prod` which have access to the development and production environment respectively +### AWS configuration + +Refer to the [terraform docs](devops/terraform/) for setting up your AWS configuration. + +### SSH configuration + +You should configure your `~/.ssh/config` with the following: ``` -[oonidevops_user_dev] -aws_access_key_id = XXX -aws_secret_access_key = YYY -source_profile = default -region = eu-central-1 -# ARN of the dev role -role_arn = arn:aws:iam::905418398257:role/oonidevops + IdentitiesOnly yes + ServerAliveInterval 120 + UserKnownHostsFile ~/.ssh/known_hosts ~/REPLACE_ME/sysadmin/ext/known_hosts + + host *.ooni.io + user YOUR_USERNAME + + host *.ooni.nu + user YOUR_USERNAME -[oonidevops_user_prod] -aws_access_key_id = XXX -aws_secret_access_key = YYY -source_profile = default -region = eu-central-1 -# ARN of the prod role -role_arn = arn:aws:iam::471112720364:role/oonidevops + host *.ooni.org + user YOUR_USERNAME ``` -Run playbook: +**TODO** restore ext/known_hosts setup + +Replace `~/REPLACE_ME/sysadmin/ext/known_hosts` to where you have cloned +the `ooni/sysadmin` repo. This will ensure you use the host key +fingeprints from this repo instead of just relying on TOFU. + +You should replace `YOUR_USERNAME` with your username from `adm_login`. + +On MacOS you may want to also add: + + host * + UseKeychain yes + +To use the Keychain to store passwords. + +## Running ansible playbooks + +Playbooks are run via an wrapper script called `./play` which notifies the slack #ooni-bots channel that a deployment has been triggered. + ``` -ansible-playbook playbook.yml -i inventory +./play -i inventory deploy-.yml -l --diff -C +./play -i inventory deploy-.yml -l --diff ``` +:::caution +any minor error in configuration files or ansible's playbooks can be +destructive for the backend infrastructure. Always test-run playbooks +with `--diff` and `-C` at first and carefully verify configuration +changes. After verification run the playbook without `-C` and verify +again the applied changes. +::: + +:::note +[Etckeeper](#etckeeper) 🔧 can be useful to verify configuration +changes from a different point of view. +::: + +In general there are two classes of playbooks: +* Those starting with `deploy-*.yml`, which are used to deploy specific components or pieces of components related to OONI infrastructure. All of these playbooks are included inside of `playbook.yml` to faciliate testing and ensuring that every component in our infrastucture is fully deployable. 
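+ +  For example, a sketch of checking that everything is deployable by dry-running the top-level playbook with the flags described above (`-C` avoids applying any changes): + +  ``` +  ./play -i inventory playbook.yml --diff -C +  ```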
+* Those starting with `playbook-*`, which are playbooks for specific tasks that are not necessarily part of the main infrastructure deployment (e.g. bootstrapping nodes upon creation, creating snapshots of remote configurations, etc.) + +Some notable playbooks and roles are: + +The bootstrap playbook, `playbook-bootstrap.yml`, should be run once when a new host is created. + +The nftables firewall is configured to read every `.nft` file under +`/etc/ooni/nftables/` and its subdirectories (e.g. `/etc/ooni/nftables/tcp/`). This allows each role to +create a small file that opens a single port, keeping the firewall configuration as +close as possible to the ansible step that deploys the service. See this in use inside the `nftables` role. + +#### The root account + +Runbooks use ssh to log in to the hosts with your own account, leveraging `sudo` to act as root. + +The only exception is when a new host is being deployed: in that case ansible will log in as root to create +individual accounts and lock out the root user. + +When running an entire runbook, ansible might try to run it as root. +This can be avoided by selecting only the required tags using `-t <tag>`. + +Ideally the root user should be disabled after successfully creating user accounts. + +#### Roles layout + +Ansible playbooks use multiple roles (see +[example](https://github.com/ooni/sysadmin/blob/master/ansible/deploy-backend.yml#L46)) +to deploy various components. + +A few roles use the `meta/main.yml` file to depend on other roles. See +[example](https://github.com/ooni/sysadmin/blob/master/ansible/roles/ooni-backend/meta/main.yml) + +:::note +The latter method should be used sparingly because ansible does not +indicate where each task in a playbook is coming from. Moreover, if the same dependency is specified in two roles, it will run twice. +::: + +A diagram of the role dependencies for the deploy-backend.yml playbook: + +```mermaid + +flowchart LR + A(deploy-backend.yml) --> B(base-bullseye) + B -- meta --> G(adm) + A --> F(nftables) + A --> C(nginx-buster) + A --> D(dehydrated) + D -- meta --> C + E -- meta --> F + A --> E(ooni-backend) + style B fill:#eeffee + style C fill:#eeffee + style D fill:#eeffee + style E fill:#eeffee + style F fill:#eeffee + style G fill:#eeffee +``` + +A similar diagram for deploy-monitoring.yml: + +```mermaid + +flowchart LR + B -- meta --> G(adm) + M(deploy-monitoring.yml) --> B(base-bookworm) + M --> O(ooca-cert) + M --> F(nftables) + M --> D(dehydrated) -- meta --> N(nginx-buster) + M --> P(prometheus) + M --> X(blackbox-exporter) + M --> T(alertmanager) + style B fill:#eeffee + style D fill:#eeffee + style F fill:#eeffee + style G fill:#eeffee + style N fill:#eeffee + style O fill:#eeffee + style P fill:#eeffee + style T fill:#eeffee + style X fill:#eeffee +``` + +:::note +When deploying new files or updating files that already exist on the hosts, it is useful to add a header comment such as "Deployed by ansible, see <path of the role or template that generated it>". +This helps track down how files on the host were modified and why.
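+For example, the `resolved.conf` template deployed by the bootstrap role in this repo begins with such a header: + +``` +# Deployed by ansible +# See roles/bootstrap/templates/resolved.conf +```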
+::: + +### Platform specific known bugs + On macOS you might run into this issue: https://github.com/ansible/ansible/issues/76322 The current workaround is to export the following environment variable before running ansible: diff --git a/ansible/deploy-airflow.yml b/ansible/deploy-airflow.yml new file mode 100644 index 00000000..ebf34e4a --- /dev/null +++ b/ansible/deploy-airflow.yml @@ -0,0 +1,9 @@ +--- +- name: Deploy airflow frontend host + hosts: + - data1.htz-fsn.prod.ooni.nu + become: true + roles: + - oonidata_airflow + vars: + airflow_public_fqdn: "airflow.prod.ooni.io" diff --git a/ansible/deploy-bootstrap.yml b/ansible/deploy-bootstrap.yml new file mode 100644 index 00000000..81dccab1 --- /dev/null +++ b/ansible/deploy-bootstrap.yml @@ -0,0 +1,7 @@ +- name: Ensure all hosts are bootstrapped correctly + hosts: all + become: yes + roles: + - bootstrap + tags: + - bootstrap diff --git a/ansible/deploy-clickhouse.yml b/ansible/deploy-clickhouse.yml new file mode 100644 index 00000000..c2d34cc7 --- /dev/null +++ b/ansible/deploy-clickhouse.yml @@ -0,0 +1,13 @@ +--- +- name: Deploy oonidata clickhouse hosts + hosts: + - notebook.ooni.org + - data1.htz-fsn.prod.ooni.nu + #- data2.htz-fsn.prod.ooni.nu + - data3.htz-fsn.prod.ooni.nu + become: true + tags: + - clickhouse + roles: + - prometheus_node_exporter + - oonidata_clickhouse diff --git a/ansible/playbook-controller.yml b/ansible/deploy-controller.yml similarity index 100% rename from ansible/playbook-controller.yml rename to ansible/deploy-controller.yml diff --git a/ansible/deploy-monitoring-config.yml b/ansible/deploy-monitoring-config.yml new file mode 100644 index 00000000..0c27527e --- /dev/null +++ b/ansible/deploy-monitoring-config.yml @@ -0,0 +1,10 @@ +--- +- name: Update monitoring config + hosts: monitoring.ooni.org + become: true + tags: + - monitoring + roles: + - prometheus + - prometheus_blackbox_exporter + - prometheus_alertmanager diff --git a/ansible/deploy-monitoring.yml b/ansible/deploy-monitoring.yml new file mode 100644 index 00000000..2fd77466 --- /dev/null +++ b/ansible/deploy-monitoring.yml @@ -0,0 +1,12 @@ +--- +- name: Deploy monitoring host + hosts: monitoring.ooni.org + become: true + tags: + - monitoring + roles: + - monitoring + vars: + monitoring_htpasswd: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/monitoring_htpasswd', profile='oonidevops_user_prod') }}" + +- ansible.builtin.import_playbook: deploy-monitoring-config.yml diff --git a/ansible/deploy-ooni-backend.yml b/ansible/deploy-ooni-backend.yml new file mode 100644 index 00000000..24c70aac --- /dev/null +++ b/ansible/deploy-ooni-backend.yml @@ -0,0 +1,21 @@ +--- +- hosts: backend-hel.ooni.org + roles: + - role: bootstrap + - role: base-backend + - role: nftables + - role: nginx + tags: nginx + vars: + nginx_user: "www-data" + - role: dehydrated + tags: dehydrated + expand: yes + vars: + ssl_domains: + # with dehydrated the first entry is the cert FQDN + # and the other ones are alternative names + - "backend-hel.ooni.org" + - role: ooni-backend + vars: + ssl_domain: backend-hel.ooni.org diff --git a/ansible/deploy-tier0.yml b/ansible/deploy-tier0.yml new file mode 100644 index 00000000..3657d544 --- /dev/null +++ b/ansible/deploy-tier0.yml @@ -0,0 +1,12 @@ +--- +- name: Include monitoring playbook + ansible.builtin.import_playbook: deploy-monitoring.yml + +- name: Include ooni-backend playbook + ansible.builtin.import_playbook: deploy-ooni-backend.yml + +- name: Include clickhouse playbook + ansible.builtin.import_playbook: 
deploy-clickhouse.yml + +- name: Include airflow playbook + ansible.builtin.import_playbook: deploy-airflow.yml diff --git a/ansible/deploy-tier2.yml b/ansible/deploy-tier2.yml new file mode 100644 index 00000000..8f87a663 --- /dev/null +++ b/ansible/deploy-tier2.yml @@ -0,0 +1,25 @@ +--- +- name: Setup OpenVPN server + hosts: openvpn-server1.ooni.io + become: true + remote_user: root + roles: + - ssh_users + +- name: Deploy notebook host + hosts: notebook.ooni.org + become: true + tags: + - notebook + vars: + enable_oonipipeline_worker: false + roles: + - oonidata + +# commented out due to the fact it requires manual config of ~/.ssh/config +#- name: Setup codesign box +# hosts: codesign-box +# become: true +# remote_user: ubuntu +# roles: +# - codesign_box diff --git a/ansible/group_vars/airflow/vars.yml b/ansible/group_vars/airflow/vars.yml new file mode 100644 index 00000000..3fb68ebe --- /dev/null +++ b/ansible/group_vars/airflow/vars.yml @@ -0,0 +1,12 @@ +airflow_admin_users: + - name: OONI Admin + username: admin + password: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/airflow_admin_password', profile='oonidevops_user_prod') }}" + role: Admin + firstname: Open + lastname: Observatory + email: admin@ooni.org +airflow_fernet_key: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/airflow_fernet_key', profile='oonidevops_user_prod') }}" +airflow_webserver_secret_key: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/airflow_webserver_secret_key', profile='oonidevops_user_prod') }}" +airflow_executor: "LocalExecutor" +airflow_database_conn: "postgresql+psycopg2://airflow:{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/airflow_postgresql_password', profile='oonidevops_user_prod') }}@ooni-tier0-postgres.c7mgscca82no.eu-central-1.rds.amazonaws.com/airflow" diff --git a/ansible/group_vars/all/vars.yml b/ansible/group_vars/all/vars.yml index 936fd374..b96b60fa 100644 --- a/ansible/group_vars/all/vars.yml +++ b/ansible/group_vars/all/vars.yml @@ -2,11 +2,22 @@ ssh_users: agrabeli: login: agrabeli comment: Maria Xynou - keys: ["ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDD0JSwM+t3Uz9lS3Mjoz9oo4vOToWyzboZhYQbP8JY5HvFtAvWanWHnUBO91t6hkgKIMiUqhdCJn26fqkhSGe/bRBaFUocOmuyfcmZoRdi0qzAskmycJsj/w6vWR4x6MYkmJvSeI/MGxjEFt4s2MfOG1tP8CBLUYft9qUleeJa7Jln8c+xbnqB7YngaI190icQHE9NuIB2CXvzbmo3tLtHNMagEwI7VoBDj6mxzTxBd9JhuhF4w5uGxxm0Gp1hzk+15obNnaBS+Anr7jXz8FPwwxCH+XhBZxB1PPpcIayKrf9iLyGtwmhkdDoWCqYAr1mue3LxFso+TZF4bwE4Cjt1 agrabelh@agrabelh"] + keys: + [ + "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDD0JSwM+t3Uz9lS3Mjoz9oo4vOToWyzboZhYQbP8JY5HvFtAvWanWHnUBO91t6hkgKIMiUqhdCJn26fqkhSGe/bRBaFUocOmuyfcmZoRdi0qzAskmycJsj/w6vWR4x6MYkmJvSeI/MGxjEFt4s2MfOG1tP8CBLUYft9qUleeJa7Jln8c+xbnqB7YngaI190icQHE9NuIB2CXvzbmo3tLtHNMagEwI7VoBDj6mxzTxBd9JhuhF4w5uGxxm0Gp1hzk+15obNnaBS+Anr7jXz8FPwwxCH+XhBZxB1PPpcIayKrf9iLyGtwmhkdDoWCqYAr1mue3LxFso+TZF4bwE4Cjt1 agrabelh@agrabelh", + ] art: login: art comment: Arturo Filasto - keys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJsibU0nsQFFIdolD1POzXOws4VetV0ZNByINRzY8Hx0 arturo@ooni.org"] + keys: + [ + "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJsibU0nsQFFIdolD1POzXOws4VetV0ZNByINRzY8Hx0 arturo@ooni.org", + ] + hynnot: + login: hynnot + comment: Tony Morella + keys: + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBTiOgr4PenzkF03NqFTGgacZ1BUWLkdCS1xNba4iLfP hynnot" majakomel: login: majakomel comment: Maja Komel @@ -22,8 +33,14 @@ ssh_users: comment: Norbel Ambanumben keys: - "ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAACAQDBXprrutdT6AhrV9hWBKjyzq6RqGmCBWpWxi3qwJyRcBJfkiEYKV9QWl3H0g/Sg9JzLd9lWG2yfAai7cyBAT4Ih0+OhwQ0V7wkhBn4YkNjs7d4BGPHjuLIywS9VtmiyH7VafikMjmqPLL/uPBIbRrx9RuSfLkAuN9XFZpVmqzWY8ePpcRCvnG6ucPxEY8o+4j5nfTrgxSaIT31kH16/PFJe07tn1SZjxZE4sZTz/p9xKt6s8HXmlP3RdnXSpXWmH8ZwYDrNhkcH8m6mC3giiqSKThFdwvQVflRRvn9pAlUOhy6KIBtAt1KobVJtOCPrrkcLhQ1C+2P9wKhfYspCGrScFGnrUqumLxPpwlqILxJvmgqGAtkm8Ela9f2D9sEv8CUv5x9XptZKlyRhtOLixvLYoJlwfXXnmXa8T1pg8+4063BhHUOu/bg0InpSp3hdscOfk0R8FtDlXnn6COwbPXynIt4PxzIxD/WQhP0ymgH3ky6ClB5wRBVhOqYvxQw32n2QFS9A5ocga+nATiOE7BTOufgmDCA/OIXfJ/GukXRaMCBsvlx7tObHS1LOMt0I+WdoOEjI0ARUrFzwoiTrs9QYmd922e7S35EnheT3JjnCTjebJrCNtwritUy8vjsN/M27wJs7MAXleT7drwXXnm+3xYrH+4KQ+ru0dxMe1zfBw== aanorbel@gmail.com" + luis: + login: luis + comment: Luis Diaz + keys: + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHc04zv+G8vGOS/znLy6xd3lB0/B07uaFjgyh4UgqUMA luis@openobservatory.org" + +admin_usernames: [art, mehul, luis, hynnot] +root_usernames: [art, mehul, luis, hynnot] +non_admin_usernames: [] -admin_usernames: [ art, majakomel, mehul, norbel ] -root_usernames: [ art, mehul ] -non_admin_usernames: [ agrabeli ] -deactivated_usernames: [ sbs, federico, sarath ] \ No newline at end of file +prometheus_metrics_password: "{{ lookup('amazon.aws.aws_secret', 'oonidevops/ooni_services/prometheus_metrics_password', profile='oonidevops_user_prod') }}" diff --git a/ansible/group_vars/clickhouse/vars.yml b/ansible/group_vars/clickhouse/vars.yml new file mode 100644 index 00000000..f1ac5248 --- /dev/null +++ b/ansible/group_vars/clickhouse/vars.yml @@ -0,0 +1,216 @@ +nftables_clickhouse_allow: + - fqdn: data1.htz-fsn.prod.ooni.nu + ip: 142.132.254.225 + - fqdn: data2.htz-fsn.prod.ooni.nu + ip: 88.198.54.12 + - fqdn: data3.htz-fsn.prod.ooni.nu + ip: 168.119.7.188 + - fqdn: notebook.ooni.org + ip: 138.201.19.39 + - fqdn: clickhouseproxy.dev.ooni.io + ip: "{{ lookup('dig', 'clickhouseproxy.dev.ooni.io/A') }}" + +nftables_zookeeper_allow: + - fqdn: data1.htz-fsn.prod.ooni.nu + ip: 142.132.254.225 + - fqdn: data2.htz-fsn.prod.ooni.nu + ip: 88.198.54.12 + - fqdn: data3.htz-fsn.prod.ooni.nu + ip: 168.119.7.188 + - fqdn: notebook.ooni.org + ip: 138.201.19.39 + +clickhouse_version: 24.8.6.70 + +clickhouse_config: + max_connections: 4096 + keep_alive_timeout: 3 + max_concurrent_queries: 100 + max_server_memory_usage: 21001001000 + max_thread_pool_size: 10000 + max_server_memory_usage_to_ram_ratio: 0.9 + total_memory_profiler_step: 4194304 + total_memory_tracker_sample_probability: 0 + uncompressed_cache_size: 8589934592 + mark_cache_size: 5368709120 + # max_open_files: 262144 + mmap_cache_size: 1000 + compiled_expression_cache_size: 134217728 + compiled_expression_cache_elements_size: 10000 + # tmp_policy: tmp + default_profile: default + custom_settings_prefixes: "" + system_profile: write + # buffer_profile: default + default_database: default + # timezone: + # umask: 027 + mlock_executable: true + remap_executable: true + builtin_dictionaries_reload_interval: 3600 + max_session_timeout: 3600 + default_session_timeout: 60 + # regions_hierarchy_file: /opt/geo/regions_hierarchy.txt + # regions_names_files_path: /opt/geo/ + # top_level_domains_path: /var/lib/clickhouse/top_level_domains/ + # top_level_domains: # Path to the list is under top_level_domains_path + # - domain: + # name: example_name + # path: /path/to/example_name.dat + dictionaries_config: "*_dictionary.xml" + user_defined_executable_functions_config: "*_function.xml" + # max_table_size_to_drop: 0 + # max_partition_size_to_drop: 0 + 
format_schema_path: /var/lib/clickhouse/format_schemas/ + # disable_internal_dns_cache: 1 + +clickhouse_keeper: + tcp_port: 9181 + log_storage_path: /var/lib/clickhouse/coordination/log + snapshot_storage_path: /var/lib/clickhouse/coordination/snapshots + coordination_settings: + operation_timeout_ms: 10000 + session_timeout_ms: 30000 + raft_logs_level: trace + keeper_servers: + - keeper_server: + server: data1.htz-fsn.prod.ooni.nu + id: 1 + hostname: clickhouse1.prod.ooni.io + port: 9234 + + #- keeper_server: + # server: data2.htz-fsn.prod.ooni.nu + # id: 2 + # hostname: clickhouse2.prod.ooni.io + # port: 9234 + + - keeper_server: + server: data3.htz-fsn.prod.ooni.nu + id: 3 + hostname: clickhouse3.prod.ooni.io + port: 9234 + + - keeper_server: + server: notebook.ooni.org + id: 4 + hostname: notebook.ooni.org + port: 9234 + +clickhouse_zookeeper: + - node: + host: clickhouse1.prod.ooni.io + port: 9181 + - node: + host: clickhouse3.prod.ooni.io + port: 9181 + - node: + host: notebook.ooni.org + port: 9181 + +clickhouse_remote_servers: + - server: + servername: oonidata_cluster + secret: "{{ lookup('amazon.aws.aws_secret', 'oonidevops/clickhouse_oonidata_cluster_secret', profile='oonidevops_user_prod') }}" + shards: + - shard: + internal_replication: true + replicas: + - replica: + host: clickhouse1.prod.ooni.io + port: 9000 + #- replica: + # host: clickhouse2.prod.ooni.io + # port: 9000 + - replica: + host: clickhouse3.prod.ooni.io + port: 9000 + +clickhouse_macros: + - macro: | + <shard>01</shard> + <replica>01</replica> + server: + - data1.htz-fsn.prod.ooni.nu + - macro: | + <shard>01</shard> + <replica>02</replica> + server: + - data2.htz-fsn.prod.ooni.nu + - macro: | + <shard>01</shard> + <replica>03</replica> + server: + - data3.htz-fsn.prod.ooni.nu + - macro: | + <cluster>oonidata_cluster</cluster> + +clickhouse_distributed_ddl: + path: "/clickhouse/task_queue/ddl" + profile: "write" + pool_size: 1 + task_max_lifetime: 604800 + cleanup_delay_period: 60 + max_tasks_in_queue: 1000 + +clickhouse_default_profiles: + default: + readonly: 2 + max_memory_usage: 11001001000 + use_uncompressed_cache: 0 + load_balancing: random + max_partitions_per_insert_block: 100 + readonly: + readonly: 1 + write: + readonly: 0 + +clickhouse_listen_hosts: + - "::" + +clickhouse_default_users: + - user: + name: default + password: + networks: + - "127.0.0.1" + profile: default + quota: default + - user: + name: readonly + password_sha256_hex: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/clickhouse_readonly_password', profile='oonidevops_user_prod') | hash('sha256') }}" + networks: + - "0.0.0.0/0" + profile: readonly + quota: default + - user: + name: write + password_sha256_hex: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/clickhouse_write_password', profile='oonidevops_user_prod') | hash('sha256') }}" + networks: + - "0.0.0.0/0" + profile: write + quota: default + +clickhouse_default_quotas: + - quota: + name: default + duration: 3600 + queries: 0 + errors: 0 + result_rows: 0 + read_rows: 0 + execution_time: 0 + +clickhouse_prometheus: + endpoint: "/metrics" + port: 9363 + metrics: true + events: true + asynchronous_metrics: true + status_info: true + +prometheus_nginx_proxy_config: + - location: /metrics/node_exporter + proxy_pass: http://127.0.0.1:8100/metrics + - location: /metrics/clickhouse + proxy_pass: http://127.0.0.1:9363/metrics diff --git a/ansible/group_vars/dev/vars.yml b/ansible/group_vars/dev/vars.yml index a952a5d4..05d78af8 100644 --- a/ansible/group_vars/dev/vars.yml +++ b/ansible/group_vars/dev/vars.yml @@ -1 +1,3 @@ -prometheus_metrics_password: "{{ lookup('amazon.aws.aws_secret',
'oonidevops/ooni_services/prometheus_metrics_password', profile='oonidevops_user_dev') }}" \ No newline at end of file +prometheus_metrics_password: "{{ lookup('amazon.aws.aws_secret', 'oonidevops/ooni_services/prometheus_metrics_password', profile='oonidevops_user_dev') }}" +admin_usernames: [ art, mehul, norbel, majakomel ] +non_admin_usernames: [ agrabeli ] diff --git a/ansible/group_vars/prod/vars.yml b/ansible/group_vars/prod/vars.yml index 0248a20a..b80680bb 100644 --- a/ansible/group_vars/prod/vars.yml +++ b/ansible/group_vars/prod/vars.yml @@ -1 +1,7 @@ -prometheus_metrics_password: "{{ lookup('amazon.aws.aws_secret', 'oonidevops/ooni_services/prometheus_metrics_password', profile='oonidevops_user_prod') }}" \ No newline at end of file +prometheus_metrics_password: "{{ lookup('amazon.aws.aws_secret', 'oonidevops/ooni_services/prometheus_metrics_password', profile='oonidevops_user_prod') }}" +tailscale_authkey: "{{ lookup('amazon.aws.aws_secret', 'oonidevops/tailscale_authkey_devops', profile='oonidevops_user_prod') }}" +tailscale_tags: + - "devops-prod" +tailscale_oauth_ephemeral: false +admin_usernames: [ art, mehul ] +non_admin_usernames: [ ] diff --git a/ansible/host_vars/ams-slack-1.ooni.org b/ansible/host_vars/ams-slack-1.ooni.org new file mode 100644 index 00000000..f6dec97a --- /dev/null +++ b/ansible/host_vars/ams-slack-1.ooni.org @@ -0,0 +1,13 @@ +nft_rules_tcp: + - name: 22 + rules: + - add rule inet filter input tcp dport 22 counter accept comment "Incoming SSH" + - name: 80 + rules: + - add rule inet filter input tcp dport 80 counter accept comment "incoming HTTP" + - name: 443 + rules: + - add rule inet filter input tcp dport 443 counter accept comment "incoming HTTPS" + - name: 9100 + rules: + - add rule inet filter input ip saddr 5.9.112.244 tcp dport 9100 counter accept comment "clickhouse prometheus from monitoring.ooni.org" diff --git a/ansible/host_vars/data3.htz-fsn.prod.ooni.nu b/ansible/host_vars/data3.htz-fsn.prod.ooni.nu new file mode 100644 index 00000000..4af35281 --- /dev/null +++ b/ansible/host_vars/data3.htz-fsn.prod.ooni.nu @@ -0,0 +1,2 @@ +non_admin_usernames: [ ] +clickhouse_base_path: /data/clickhouse diff --git a/ansible/host_vars/data.ooni.org b/ansible/host_vars/notebook.ooni.org similarity index 70% rename from ansible/host_vars/data.ooni.org rename to ansible/host_vars/notebook.ooni.org index 7763cdf7..141d387a 100644 --- a/ansible/host_vars/data.ooni.org +++ b/ansible/host_vars/notebook.ooni.org @@ -54,7 +54,64 @@ ssh_users: [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMJYsbeTjdma5cKyZISOFQfHbwwlZbWugPx9haeOx1UR" ] -admin_usernames: [ art, majakomel, mehul, norbel ] -non_admin_usernames: [ ain, siti, ingrid, joss, vasilis ] + michael: + login: michael + comment: "Micheal Collyer" + keys: + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPN4Ae+KfZEbhJuvHI3PXjgeu4V0ZFIpUy9bFuBKx76W michael.collyer@oii.ox.ac.uk" + benginoe: + login: benginoe + comment: "Ben Ginoe" + keys: + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOWdWCATiHUAzoS3mn3pFMIYDmi3n4Ekuzv5cEtvV0W1 root@parrot" + felixhoffmnn: + login: felixhoffmnn + comment: "Felix Hoffmann" + keys: + - "sk-ssh-ed25519@openssh.com AAAAGnNrLXNzaC1lZDI1NTE5QG9wZW5zc2guY29tAAAAIHsT7RNb3xSc4jseb6vPPvC2ORWQHQr66AQR54Vikt/cAAAAB3NzaDpocGk= ssh:hpi" + agix: + login: agix + comment: "Armin Huremagic" + keys: ["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOzCkhJ4DgtReYaR4MVh+FzDw7s2j3v4qBmE+Mpk+igc agix@riseup.net"] + luis: + login: luis + comment: Luis Diaz + keys: + - "ssh-ed25519 
AAAAC3NzaC1lZDI1NTE5AAAAIHc04zv+G8vGOS/znLy6xd3lB0/B07uaFjgyh4UgqUMA luis@openobservatory.org" + hynnot: + login: hynnot + comment: Tony Morella + keys: + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBTiOgr4PenzkF03NqFTGgacZ1BUWLkdCS1xNba4iLfP hynnot" + +admin_usernames: [ art, agrabeli, majakomel, mehul, norbel, luis, hynnot ] +non_admin_usernames: [ ain, siti, ingrid, joss, vasilis, michael, benginoe, felixhoffmnn ] jupyterhub_allowed_users: "{{ ssh_users }}" -admin_group_name: adm \ No newline at end of file +admin_group_name: admin + +clickhouse_default_profiles: + default: + readonly: 2 + write: + readonly: 0 + +clickhouse_version: "24.10.2.80" +clickhouse_release_type: stable +clickhouse_listen_hosts: + - "127.0.0.1" + +clickhouse_default_users: + - user: + name: default + password: + networks: + - "127.0.0.1" + profile: default + quota: default + - user: + name: write + password_sha256_hex: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/clickhouse_notebook_write_password', profile='oonidevops_user_prod') | hash('sha256') }}" + networks: + - "127.0.0.1" + profile: write + quota: default diff --git a/ansible/inventory b/ansible/inventory index 1e13c160..6b24654a 100644 --- a/ansible/inventory +++ b/ansible/inventory @@ -1,13 +1,33 @@ -[all] -# This requires manual setup of ~/.ssh/config -#codesign-box +[all:children] +htz_fsn +ghs_ams -[prod] -data.ooni.org -oonidata.ooni.org +## Role tags + +[clickhouse] +notebook.ooni.org +data1.htz-fsn.prod.ooni.nu +data3.htz-fsn.prod.ooni.nu + +[airflow] +data1.htz-fsn.prod.ooni.nu + +## Location tags + +[htz_fsn] monitoring.ooni.org -openvpn-server1.ooni.io notebook.ooni.org +data1.htz-fsn.prod.ooni.nu +data3.htz-fsn.prod.ooni.nu +#backend-fsn.ooni.org -[dev] -oonidatatest.ooni.nu +[htz_hel] +backend-hel.ooni.org + +[ghs_ams] +openvpn-server1.ooni.io +ams-slack-1.ooni.org +ams-ps.ooni.nu +# currently disabled due to them not supporting ed25519 keys +#mia-echoth.ooni.nu +#mia-httpth.ooni.nu diff --git a/ansible/inventory-legacy b/ansible/inventory-legacy index 3988b30d..b14d1f76 100644 --- a/ansible/inventory-legacy +++ b/ansible/inventory-legacy @@ -54,7 +54,7 @@ ams-slack-1.ooni.org # Digital Ocean Amsterdam Hosts [doams] -#doams1-countly.ooni.nu +doams1-countly.ooni.nu # FIXME Disabled due to location tags not working as expected #ams-pg.ooni.org #ams-pg-test.ooni.org diff --git a/ansible/play b/ansible/play new file mode 100755 index 00000000..26aeff3e --- /dev/null +++ b/ansible/play @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -ue + +## ansible-playbook is a wrapper script used to send a notification to slack +# whenever a new ansible deploy is triggered + +ANSIBLE_SLACK_CMD=`printf "%q " "$0" "$@"` +ANSIBLE_SLACK_CMD="${ANSIBLE_SLACK_CMD% }" # strip trailing whitespace +export ANSIBLE_SLACK_CMD + +# Check if --check or -C is present in the arguments +if [[ ! " $* " =~ " --check " && ! 
" $* " =~ " -C " ]]; then + ansible localhost --module-name include_role --args name=notify-slack +fi +ansible-playbook "$@" diff --git a/ansible/playbook-bootstrap.yml b/ansible/playbook-bootstrap.yml index ab0d34d3..56be3b81 100644 --- a/ansible/playbook-bootstrap.yml +++ b/ansible/playbook-bootstrap.yml @@ -4,5 +4,4 @@ hosts: all remote_user: root roles: - - ssh_users - bootstrap diff --git a/ansible/playbook.yml b/ansible/playbook.yml index dece3fbe..7674acd9 100644 --- a/ansible/playbook.yml +++ b/ansible/playbook.yml @@ -1,57 +1,9 @@ --- -- name: Ensure all hosts are bootstrapped correctly - hosts: all - become: yes - roles: - - bootstrap +- name: Include bootstrap playbook + ansible.builtin.import_playbook: deploy-bootstrap.yml -- name: ClickHouse servers - hosts: clickhouse_servers - user: admin - become: true - vars: - clickhouse_reader_password: "{{ lookup('env', 'CLICKHOUSE_READER_PASSWORD') }}" - roles: - - clickhouse - handlers: - - name: Restart clickhouse-server - ansible.builtin.service: - name: clickhouse-server - state: restarted +- name: Include tier0 playbook + ansible.builtin.import_playbook: deploy-tier0.yml -- name: Update monitoring config - hosts: monitoring.ooni.org - become: true - roles: - - prometheus - - prometheus_blackbox_exporter - - prometheus_alertmanager - -- name: Deploy data.ooni.org host - hosts: data.ooni.org - become: true - roles: - #- clickhouse - - ssh_users - #- jupyterhub - -- name: Setup OpenVPN server - hosts: openvpn-server1.ooni.io - become: true - remote_user: root - roles: - - ssh_users - -- name: Deploy oonidata hosts - hosts: oonidata.ooni.org - become: true - roles: - - oonidata - -# commented out due to the fact it requires manual config of ~/.ssh/config -#- name: Setup codesign box -# hosts: codesign-box -# become: true -# remote_user: ubuntu -# roles: -# - codesign_box +- name: Include tier2 playbook + ansible.builtin.import_playbook: deploy-tier2.yml diff --git a/ansible/requirements.yml b/ansible/requirements.yml deleted file mode 100644 index 3b4d5ae0..00000000 --- a/ansible/requirements.yml +++ /dev/null @@ -1,4 +0,0 @@ -- src: willshersystems.sshd -- src: nginxinc.nginx -- src: geerlingguy.certbot -- src: geerlingguy.node_exporter \ No newline at end of file diff --git a/ansible/requirements/ansible-galaxy.yml b/ansible/requirements/ansible-galaxy.yml new file mode 100644 index 00000000..e78d86bb --- /dev/null +++ b/ansible/requirements/ansible-galaxy.yml @@ -0,0 +1,15 @@ +- src: willshersystems.sshd + version: v0.25.0 +- src: nginxinc.nginx + version: 0.24.3 +- src: geerlingguy.certbot + version: 5.2.0 +- src: artis3n.tailscale + version: v4.5.0 +- src: https://github.com/idealista/clickhouse_role + scm: git + version: 3.5.1 + name: idealista.clickhouse_role +- src: https://github.com/ooni/airflow-role.git + scm: git + name: ooni.airflow_role diff --git a/ansible/requirements/python.yml b/ansible/requirements/python.yml new file mode 100644 index 00000000..5c42f3dc --- /dev/null +++ b/ansible/requirements/python.yml @@ -0,0 +1,3 @@ +ansible==9.3.0 +boto3==1.34.65 +dnspython==2.6.1 diff --git a/ansible/roles/base-backend/README.adoc b/ansible/roles/base-backend/README.adoc new file mode 100644 index 00000000..ac3f7039 --- /dev/null +++ b/ansible/roles/base-backend/README.adoc @@ -0,0 +1 @@ +Configure base host based on backend hosts diff --git a/ansible/roles/base-backend/handlers/main.yml b/ansible/roles/base-backend/handlers/main.yml new file mode 100644 index 00000000..4a8d06e8 --- /dev/null +++ 
b/ansible/roles/base-backend/handlers/main.yml @@ -0,0 +1,15 @@ +- name: reload nftables + tags: nftables + ansible.builtin.systemd_service: + name: nftables + state: reloaded + +- name: restart chrony + ansible.builtin.systemd: + name: chrony.service + state: restarted + +- name: restart netdata + ansible.builtin.systemd: + name: netdata.service + state: restarted diff --git a/ansible/roles/base-backend/meta/main.yml b/ansible/roles/base-backend/meta/main.yml new file mode 100644 index 00000000..5de9bc56 --- /dev/null +++ b/ansible/roles/base-backend/meta/main.yml @@ -0,0 +1,6 @@ +--- +dependencies: + - role: adm + become: false + remote_user: root + gather_facts: false diff --git a/ansible/roles/base-backend/tasks/main.yml b/ansible/roles/base-backend/tasks/main.yml new file mode 100644 index 00000000..00a7352a --- /dev/null +++ b/ansible/roles/base-backend/tasks/main.yml @@ -0,0 +1,140 @@ +--- +- name: motd + shell: echo "" > /etc/motd + +- name: Remove apt repo + tags: apt + file: + path: /etc/apt/sources.list.d/ftp_nl_debian_org_debian.list + state: absent + +- name: Remove apt repo + tags: apt + file: + path: /etc/apt/sources.list.d/security_debian_org.list + state: absent + +- name: Create internal-deb repo GPG pubkey + tags: apt + template: + src: templates/internal-deb.gpg + dest: /etc/ooni/internal-deb.gpg + mode: 0644 + owner: root + +- name: Set apt repos + tags: apt + template: + src: templates/sources.list + dest: /etc/apt/sources.list + mode: 0644 + owner: root + +- name: Install gpg + tags: base-packages + apt: + install_recommends: no + cache_valid_time: 86400 + name: + - gpg + - gpg-agent + +- name: Update apt cache + tags: apt + apt: + update_cache: yes + +- name: Installs base packages + tags: base-packages + apt: + install_recommends: no + cache_valid_time: 86400 + name: + - bash-completion + - byobu + - chrony + - etckeeper + - fail2ban + - git + - iotop + - jupyter-notebook + - manpages + - ncdu + - netdata-core + - netdata-plugins-bash + - netdata-plugins-python + - netdata-web + - nftables + - nullmailer + - prometheus-node-exporter + - pv + # needed by ansible + - python3-apt + - rsync + - ssl-cert + - strace + - tcpdump + - tmux + - vim + +- name: Autoremove + tags: autoremove + apt: + autoremove: yes + +- name: Clean cache + tags: apt + apt: + autoclean: yes + +- name: allow netdata.service + tags: netdata + blockinfile: + path: /etc/ooni/nftables/tcp/19999.nft + create: yes + block: | + add rule inet filter input ip saddr {{ lookup('dig', 'prometheus.ooni.org/A') }} tcp dport 19999 counter accept comment "netdata.service" + notify: + - reload nftables + +- name: configure netdata.service + tags: netdata + template: + src: netdata.conf + dest: /etc/netdata/netdata.conf + +- name: disable netdata emails + tags: netdata + blockinfile: + path: /etc/netdata/conf.d/health_alarm_notify.conf + create: yes + block: | + # Managed by ansible, see roles/base-bookworm/tasks/main.yml + SEND_EMAIL="NO" + +- name: Set timezone + tags: timezone + timezone: + name: Etc/UTC + notify: + - restart chrony + +- name: configure netdata chrony + tags: netdata, timezone + blockinfile: + path: /etc/netdata/python.d/chrony.conf + create: yes + block: | + # Managed by ansible, see roles/base-bookworm/tasks/main.yml + update_every: 5 + local: + command: 'chronyc -n tracking' + +- name: configure netdata chrony + tags: netdata, timezone + lineinfile: + path: /usr/lib/netdata/conf.d/python.d.conf + regexp: '^chrony:' + line: 'chrony: yes' + notify: + - restart netdata diff --git 
a/ansible/roles/base-backend/templates/internal-deb.gpg b/ansible/roles/base-backend/templates/internal-deb.gpg new file mode 100644 index 00000000..28126a36 --- /dev/null +++ b/ansible/roles/base-backend/templates/internal-deb.gpg @@ -0,0 +1,14 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mDMEYGISFRYJKwYBBAHaRw8BAQdA4VxoR0gSsH56BbVqYdK9HNQ0Dj2YFVbvKIIZ +JKlaW920Mk9PTkkgcGFja2FnZSBzaWduaW5nIDxjb250YWN0QG9wZW5vYnNlcnZh +dG9yeS5vcmc+iJYEExYIAD4WIQS1oI8BeW5/UhhhtEk3LR/ycfLdUAUCYGISFQIb +AwUJJZgGAAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAKCRA3LR/ycfLdUFk+AQCb +gsUQsAQGxUFvxk1XQ4RgEoh7wy2yTuK8ZCkSHJ0HWwD/f2OAjDigGq07uJPYw7Uo +Ih9+mJ/ubwiPMzUWF6RSdgu4OARgYhIVEgorBgEEAZdVAQUBAQdAx4p1KerwcIhX +HfM9LbN6Gi7z9j4/12JKYOvr0d0yC30DAQgHiH4EGBYIACYWIQS1oI8BeW5/Uhhh +tEk3LR/ycfLdUAUCYGISFQIbDAUJJZgGAAAKCRA3LR/ycfLdUL4cAQCs53fLphhy +6JMwVhRs02LXi1lntUtw1c+EMn6t7XNM6gD+PXpbgSZwoV3ZViLqr58o9fZQtV3s +oN7jfdbznrWVigE= +=PtYb +-----END PGP PUBLIC KEY BLOCK----- diff --git a/ansible/roles/base-backend/templates/journald.conf b/ansible/roles/base-backend/templates/journald.conf new file mode 100644 index 00000000..d7ae85e1 --- /dev/null +++ b/ansible/roles/base-backend/templates/journald.conf @@ -0,0 +1,8 @@ +[Journal] +Storage=persistent +Compress=yes +#RateLimitIntervalSec=30s +#RateLimitBurst=10000 +SystemMaxFileSize=200M +RuntimeMaxFileSize=1G +ForwardToSyslog=no diff --git a/ansible/roles/base-backend/templates/netdata.conf b/ansible/roles/base-backend/templates/netdata.conf new file mode 100644 index 00000000..e2bef302 --- /dev/null +++ b/ansible/roles/base-backend/templates/netdata.conf @@ -0,0 +1,32 @@ +# Managed by ansible, see roles/base-bookworm/tasks/main.yml +[global] + run as user = netdata + web files owner = root + web files group = root + bind socket to IP = 0.0.0.0 + +[plugins] + python.d = yes + + +[statsd] + enabled = yes + # decimal detail = 1000 + update every (flushInterval) = 1 + # udp messages to process at once = 10 + # create private charts for metrics matching = * + max private charts allowed = 10000 + max private charts hard limit = 10000 + private charts memory mode = ram + private charts history = 300 + # histograms and timers percentile (percentThreshold) = 95.00000 + # add dimension for number of events received = no + # gaps on gauges (deleteGauges) = no + # gaps on counters (deleteCounters) = no + # gaps on meters (deleteMeters) = no + # gaps on sets (deleteSets) = no + # gaps on histograms (deleteHistograms) = no + # gaps on timers (deleteTimers) = no + # listen backlog = 4096 + # default port = 8125 + # bind to = udp:localhost:8125 tcp:localhost:8125 diff --git a/ansible/roles/base-backend/templates/ooni_internal.sources b/ansible/roles/base-backend/templates/ooni_internal.sources new file mode 100644 index 00000000..f85bc625 --- /dev/null +++ b/ansible/roles/base-backend/templates/ooni_internal.sources @@ -0,0 +1,7 @@ +Architectures: amd64 +Suites: unstable +Uris: https://ooni-internal-deb.s3.eu-central-1.amazonaws.com +Types: deb +Components: main +Enabled: yes +Signed-By: /etc/ooni/internal-deb.gpg diff --git a/ansible/roles/base-backend/templates/resolved.conf b/ansible/roles/base-backend/templates/resolved.conf new file mode 100644 index 00000000..aa68eaf1 --- /dev/null +++ b/ansible/roles/base-backend/templates/resolved.conf @@ -0,0 +1,9 @@ +# Deployed by ansible +# See roles/base-bookworm/templates/resolved.conf + +[Resolve] +DNS=9.9.9.9 +FallbackDNS=1.1.1.1 8.8.8.8 +DNSOverTLS=opportunistic +DNSSEC=allow-downgrade +Cache=yes diff --git 
a/ansible/roles/base-backend/templates/sources.list b/ansible/roles/base-backend/templates/sources.list new file mode 100644 index 00000000..7432ddad --- /dev/null +++ b/ansible/roles/base-backend/templates/sources.list @@ -0,0 +1,6 @@ +# Managed by ansible +# roles/base-bookworm/templates/sources.list + +deb http://deb.debian.org/debian bookworm main contrib non-free-firmware +deb http://deb.debian.org/debian-security/ bookworm-security main contrib non-free-firmware +deb http://deb.debian.org/debian bookworm-backports main diff --git a/ansible/roles/bootstrap/handlers/main.yml b/ansible/roles/bootstrap/handlers/main.yml new file mode 100644 index 00000000..a9c712a4 --- /dev/null +++ b/ansible/roles/bootstrap/handlers/main.yml @@ -0,0 +1,18 @@ +- name: Restart chrony + ansible.builtin.systemd_service: + name: chrony.service + state: restarted + +- name: Restart systemd-resolved + ansible.builtin.systemd_service: + name: systemd-resolved.service + state: restarted + +- name: Test systemd-resolved + ansible.builtin.shell: resolvectl query go.dnscheck.tools --cache=no + +- name: Restart systemd-journald + ansible.builtin.systemd_service: + name: systemd-journald.service + state: restarted + enabled: yes diff --git a/ansible/roles/bootstrap/tasks/main.yml b/ansible/roles/bootstrap/tasks/main.yml index 88cd3a78..500d58ff 100644 --- a/ansible/roles/bootstrap/tasks/main.yml +++ b/ansible/roles/bootstrap/tasks/main.yml @@ -1,3 +1,11 @@ +- name: write bashrc template + ansible.builtin.template: + src: bashrc + dest: /etc/skel/.bashrc + owner: root + group: root + mode: u=rw,g=r,o=r + - ansible.builtin.include_role: name: ssh_users tags: @@ -13,6 +21,7 @@ - bash-completion - ca-certificates - curl + - chrony - file - git - htop @@ -22,6 +31,7 @@ - man-db - mtr - net-tools + - nvme-cli - openssl - python3-passlib - rsync @@ -34,12 +44,41 @@ update_cache: yes install_recommends: no +- name: Set timezone + community.general.timezone: + name: Etc/UTC + notify: + - Restart chrony + - ansible.builtin.include_role: name: nftables tags: - nftables -- ansible.builtin.include_role: - name: prometheus_node_exporter +- name: Configure journald tags: - - node_exporter + - journald + template: + src: templates/journald.conf + dest: /etc/systemd/journald.conf + mode: 0644 + owner: root + notify: + - Restart systemd-journald + +- name: install systemd-resolved + tags: resolved + ansible.builtin.apt: + install_recommends: no + cache_valid_time: 86400 + name: + - systemd-resolved + +- name: configure systemd-resolved + tags: resolved + ansible.builtin.template: + src: resolved.conf + dest: /etc/systemd/resolved.conf + notify: + - Restart systemd-resolved + - Test systemd-resolved diff --git a/ansible/roles/bootstrap/templates/bashrc b/ansible/roles/bootstrap/templates/bashrc new file mode 100644 index 00000000..4d34923b --- /dev/null +++ b/ansible/roles/bootstrap/templates/bashrc @@ -0,0 +1,113 @@ +# ~/.bashrc: executed by bash(1) for non-login shells. +# see /usr/share/doc/bash/examples/startup-files (in the package bash-doc) +# for examples + +# If not running interactively, don't do anything +case $- in + *i*) ;; + *) return;; +esac + +# don't put duplicate lines or lines starting with space in the history. 
+# See bash(1) for more options +HISTCONTROL=ignoreboth + +# append to the history file, don't overwrite it +shopt -s histappend + +# for setting history length see HISTSIZE and HISTFILESIZE in bash(1) +HISTSIZE=1000 +HISTFILESIZE=2000 + +# check the window size after each command and, if necessary, +# update the values of LINES and COLUMNS. +shopt -s checkwinsize + +# If set, the pattern "**" used in a pathname expansion context will +# match all files and zero or more directories and subdirectories. +#shopt -s globstar + +# make less more friendly for non-text input files, see lesspipe(1) +#[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)" + +# set variable identifying the chroot you work in (used in the prompt below) +if [ -z "${debian_chroot:-}" ] && [ -r /etc/debian_chroot ]; then + debian_chroot=$(cat /etc/debian_chroot) +fi + +# set a fancy prompt (non-color, unless we know we "want" color) +case "$TERM" in + xterm-color|*-256color) color_prompt=yes;; +esac + +# uncomment for a colored prompt, if the terminal has the capability; turned +# off by default to not distract the user: the focus in a terminal window +# should be on the output of commands, not on the prompt +#force_color_prompt=yes + +if [ -n "$force_color_prompt" ]; then + if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then + # We have color support; assume it's compliant with Ecma-48 + # (ISO/IEC-6429). (Lack of such support is extremely rare, and such + # a case would tend to support setf rather than setaf.) + color_prompt=yes + else + color_prompt= + fi +fi + +if [ "$color_prompt" = yes ]; then + PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\H\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ ' +else + PS1='${debian_chroot:+($debian_chroot)}\u@\H:\w\$ ' +fi +unset color_prompt force_color_prompt + +# If this is an xterm set the title to user@host:dir +case "$TERM" in +xterm*|rxvt*) + PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1" + ;; +*) + ;; +esac + +# enable color support of ls and also add handy aliases +if [ -x /usr/bin/dircolors ]; then + test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)" + alias ls='ls --color=auto' + #alias dir='dir --color=auto' + #alias vdir='vdir --color=auto' + + #alias grep='grep --color=auto' + #alias fgrep='fgrep --color=auto' + #alias egrep='egrep --color=auto' +fi + +# colored GCC warnings and errors +#export GCC_COLORS='error=01;31:warning=01;35:note=01;36:caret=01;32:locus=01:quote=01' + +# some more ls aliases +#alias ll='ls -l' +#alias la='ls -A' +#alias l='ls -CF' + +# Alias definitions. +# You may want to put all your additions into a separate file like +# ~/.bash_aliases, instead of adding them here directly. +# See /usr/share/doc/bash-doc/examples in the bash-doc package. + +if [ -f ~/.bash_aliases ]; then + . ~/.bash_aliases +fi + +# enable programmable completion features (you don't need to enable +# this, if it's already enabled in /etc/bash.bashrc and /etc/profile +# sources /etc/bash.bashrc). +if ! shopt -oq posix; then + if [ -f /usr/share/bash-completion/bash_completion ]; then + . /usr/share/bash-completion/bash_completion + elif [ -f /etc/bash_completion ]; then + . 
/etc/bash_completion + fi +fi diff --git a/ansible/roles/bootstrap/templates/journald.conf b/ansible/roles/bootstrap/templates/journald.conf new file mode 100644 index 00000000..e06ebc4b --- /dev/null +++ b/ansible/roles/bootstrap/templates/journald.conf @@ -0,0 +1,11 @@ +# ansible managed +# see: roles/bootstrap/templates + +[Journal] +Storage=persistent +Compress=yes +#RateLimitIntervalSec=30s +#RateLimitBurst=10000 +SystemMaxFileSize=200M +RuntimeMaxFileSize=1G +ForwardToSyslog=no diff --git a/ansible/roles/bootstrap/templates/resolved.conf b/ansible/roles/bootstrap/templates/resolved.conf new file mode 100644 index 00000000..834d505d --- /dev/null +++ b/ansible/roles/bootstrap/templates/resolved.conf @@ -0,0 +1,9 @@ +# Deployed by ansible +# See roles/bootstrap/templates/resolved.conf + +[Resolve] +DNS=9.9.9.9 +FallbackDNS=1.1.1.1 8.8.8.8 +DNSOverTLS=opportunistic +DNSSEC=allow-downgrade +Cache=yes diff --git a/ansible/roles/clickhouse/tasks/main.yml b/ansible/roles/clickhouse/tasks/main.yml deleted file mode 100644 index ee7d90bb..00000000 --- a/ansible/roles/clickhouse/tasks/main.yml +++ /dev/null @@ -1,73 +0,0 @@ -- name: install clickhouse requirements - tags: clickhouse - apt: - cache_valid_time: 86400 - state: present - name: - - apt-transport-https - - ca-certificates - - dirmngr - -- name: Check if ClickHouse GPG keyring exists - ansible.builtin.stat: - path: /usr/share/keyrings/clickhouse-keyring.gpg - register: keyring_check - -- name: Create a temporary directory for GPG - ansible.builtin.tempfile: - state: directory - register: gnupg_temp_dir - when: not keyring_check.stat.exists - -- name: Import ClickHouse GPG key - ansible.builtin.command: - cmd: "gpg --no-default-keyring --keyring /usr/share/keyrings/clickhouse-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 8919F6BD2B48D754" - chdir: "{{ gnupg_temp_dir.path }}" - creates: "/usr/share/keyrings/clickhouse-keyring.gpg" - environment: - GNUPGHOME: "{{ gnupg_temp_dir.path }}" - when: not keyring_check.stat.exists - -- name: Remove temporary directory - ansible.builtin.file: - path: "{{ gnupg_temp_dir.path }}" - state: absent - when: not keyring_check.stat.exists - -- name: Ensure the keyring is readable - ansible.builtin.file: - path: /usr/share/keyrings/clickhouse-keyring.gpg - mode: a+r - -- name: Add ClickHouse repository - ansible.builtin.apt_repository: - repo: "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb stable main" - state: present - filename: clickhouse - -- name: Install ClickHouse server and client - ansible.builtin.apt: - name: - - clickhouse-server={{ clickhouse_pkg_ver }} - - clickhouse-client={{ clickhouse_pkg_ver }} - - clickhouse-common-static={{ clickhouse_pkg_ver }} - state: present - update_cache: yes - vars: - clickhouse_pkg_ver: 24.1.* - -- name: Ensure ClickHouse service is started and enabled - ansible.builtin.systemd: - name: clickhouse-server - state: started - enabled: yes - -- name: Configure ClickHouse users from template - template: - src: templates/ooni_users.xml - dest: /etc/clickhouse-server/users.d/ooni_users.xml - owner: clickhouse - group: clickhouse - mode: '0640' - notify: - - restart clickhouse-server diff --git a/ansible/roles/clickhouse/templates/ooni_users.xml b/ansible/roles/clickhouse/templates/ooni_users.xml deleted file mode 100644 index 26081944..00000000 --- a/ansible/roles/clickhouse/templates/ooni_users.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - 1 - - - - - - - 1 - admin - - 127.0.0.1 - - - - - - readonly 
- - 0.0.0.0 - - {{ clickhouse_reader_password }} - - - - - diff --git a/ansible/roles/dehydrated/README.adoc b/ansible/roles/dehydrated/README.adoc new file mode 100644 index 00000000..477601de --- /dev/null +++ b/ansible/roles/dehydrated/README.adoc @@ -0,0 +1,10 @@ + +Configure dehydrated to generate certificates (locally to each server) + +- listen on port 443 for ACME challenge + +- ansible --diff is supported + +- generate certificate expirations metrics for node exporter + +- changes to /etc are also tracked locally by etckeeper diff --git a/ansible/roles/dehydrated/meta/main.yml b/ansible/roles/dehydrated/meta/main.yml new file mode 100644 index 00000000..0e72e865 --- /dev/null +++ b/ansible/roles/dehydrated/meta/main.yml @@ -0,0 +1,5 @@ +--- +dependencies: + - nginx +... + diff --git a/ansible/roles/dehydrated/tasks/main.yml b/ansible/roles/dehydrated/tasks/main.yml new file mode 100644 index 00000000..0a84f1a0 --- /dev/null +++ b/ansible/roles/dehydrated/tasks/main.yml @@ -0,0 +1,100 @@ +--- +- name: Installs packages + tags: dehydrated + apt: + install_recommends: no + cache_valid_time: 86400 + name: + - dehydrated + +#- name: create dehydrated hook file +# # This hook is called after getting a new cert to deploy it +# template: +# src: templates/hook.sh +# dest: /etc/dehydrated/hook.sh +# mode: 0755 +# owner: root +# +# +#- name: set dehydrated hook +# blockinfile: +# path: /etc/dehydrated/config +# block: | +# HOOK="/etc/dehydrated/hook.sh" + +- name: Add ACME dedicated sites-enabled file + tags: dehydrated + template: + src: templates/letsencrypt-http + # the server block matches all SSL FQDNs and must be + # parsed first, hence 00- + dest: /etc/nginx/sites-enabled/00-letsencrypt-http + mode: 0644 + owner: root + +- name: Add canary file to ensure /.well-known/acme-challenge is reachable by let's encrypt + tags: dehydrated + copy: + content: | + Generated by ansible using ansible/roles/dehydrated/tasks/main.yml. + + Also, meow!!! 
+ dest: /var/lib/dehydrated/acme-challenges/ooni-acme-canary + mode: 0644 + owner: root + +- name: reload nginx + tags: dehydrated + shell: systemctl reload nginx.service + +- name: reload nftables service + tags: dehydrated + shell: systemctl reload nftables.service + +- name: Configure domains {{ ssl_domains }} + # https://github.com/dehydrated-io/dehydrated/blob/master/docs/domains_txt.md + tags: dehydrated + template: + src: templates/domains.txt.j2 + dest: /etc/dehydrated/domains.txt + +- name: Register account if needed + tags: dehydrated + ansible.builtin.shell: + cmd: "test -d /var/lib/dehydrated/accounts || dehydrated --register --accept-terms" + +- name: Install dehydrated.service + tags: dehydrated + template: + src: templates/dehydrated.service + dest: /etc/systemd/system/dehydrated.service + mode: 0644 + owner: root + +- name: Install dehydrated.timer + tags: dehydrated + template: + src: templates/dehydrated.timer + dest: /etc/systemd/system/dehydrated.timer + mode: 0644 + owner: root + +- name: Ensure timer runs + tags: dehydrated + systemd: + name: dehydrated.timer + state: started + enabled: yes + +- name: Run dehydrated service immediately + # creates: + # /var/lib/dehydrated/certs//chain.pem cert.pem privkey.pem fullchain.pem + tags: dehydrated + systemd: + name: dehydrated.service + state: started + enabled: yes + +- name: reload nginx + tags: dehydrated + shell: systemctl reload nginx.service diff --git a/ansible/roles/dehydrated/templates/dehydrated.service b/ansible/roles/dehydrated/templates/dehydrated.service new file mode 100644 index 00000000..50ffdc46 --- /dev/null +++ b/ansible/roles/dehydrated/templates/dehydrated.service @@ -0,0 +1,13 @@ +[Unit] +Description=Run dehydrated certificate refresh + +[Service] +Type=oneshot +#User=dehydrated +#Group=dehydrated +ProtectSystem=strict +ProtectHome=yes +ReadWritePaths=/var/lib/dehydrated +PrivateTmp=yes +ExecStart=/usr/bin/dehydrated --cron +ExecStartPost=+/bin/systemctl reload nginx.service diff --git a/ansible/roles/dehydrated/templates/dehydrated.timer b/ansible/roles/dehydrated/templates/dehydrated.timer new file mode 100644 index 00000000..5e6ea784 --- /dev/null +++ b/ansible/roles/dehydrated/templates/dehydrated.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Run dehydrated certificate refresh + +[Timer] +OnCalendar=Mon 13:00 + +[Install] +WantedBy=timers.target + diff --git a/ansible/roles/dehydrated/templates/domains.txt.j2 b/ansible/roles/dehydrated/templates/domains.txt.j2 new file mode 100644 index 00000000..5850d203 --- /dev/null +++ b/ansible/roles/dehydrated/templates/domains.txt.j2 @@ -0,0 +1 @@ +{% for d in ssl_domains %}{{ d }} {% endfor %} diff --git a/ansible/roles/dehydrated/templates/hook.sh b/ansible/roles/dehydrated/templates/hook.sh new file mode 100644 index 00000000..26193aeb --- /dev/null +++ b/ansible/roles/dehydrated/templates/hook.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +# Deployed by ansible +# see ansible/roles/dehydrated/templates/hook.sh +# +deploy_cert() { + local DOMAIN="${1}" KEYFILE="${2}" CERTFILE="${3}" FULLCHAINFILE="${4}" CHAINFILE="${5}" TIMESTAMP="${6}" + # This hook is called once for each certificate that has been produced. + # Parameters: + # - DOMAIN The primary domain name, i.e. the certificate common name (CN). + # - KEYFILE The path of the file containing the private key. + # - CERTFILE The path of the file containing the signed certificate. + # - FULLCHAINFILE The path of the file containing the full certificate chain. 
+ # - CHAINFILE The path of the file containing the intermediate certificate(s). + # - TIMESTAMP Timestamp when the specified certificate was created. + + logger "Deploying SSL certificate $DOMAIN $KEYFILE $CERTFILE $FULLCHAINFILE $CHAINFILE $TIMESTAMP" + # cp ... + #systemctl reload nginx +} diff --git a/ansible/roles/dehydrated/templates/letsencrypt-http b/ansible/roles/dehydrated/templates/letsencrypt-http new file mode 100644 index 00000000..41fda273 --- /dev/null +++ b/ansible/roles/dehydrated/templates/letsencrypt-http @@ -0,0 +1,13 @@ +# Generated by ansible +# roles/dehydrated/templates/letsencrypt-http + +server { + # Listen on port 80 for *any* domain + listen 80; + server_name _; + + # Serve ACME challenge from disk + location ^~ /.well-known/acme-challenge { + alias /var/lib/dehydrated/acme-challenges; + } +} diff --git a/ansible/roles/miniconda/tasks/main.yml b/ansible/roles/miniconda/tasks/main.yml index 0ea358b3..fa195d36 100644 --- a/ansible/roles/miniconda/tasks/main.yml +++ b/ansible/roles/miniconda/tasks/main.yml @@ -3,6 +3,7 @@ ansible.builtin.user: name: miniconda shell: /bin/false + home: "{{ miniconda_install_dir }}" - name: Check if Miniconda is installed ansible.builtin.stat: @@ -11,11 +12,3 @@ - include_tasks: install.yml when: not miniconda_bin.stat.exists - -- name: "install conda packages" - ansible.builtin.shell: - cmd: "{{ miniconda_install_dir }}/bin/conda install -y {{ item }}" - loop: - - pandas - - numpy - - altair diff --git a/ansible/roles/monitoring/defaults/main.yml b/ansible/roles/monitoring/defaults/main.yml new file mode 100644 index 00000000..47ef408f --- /dev/null +++ b/ansible/roles/monitoring/defaults/main.yml @@ -0,0 +1 @@ +enable_log_ingestion: false diff --git a/ansible/roles/monitoring/files/create_logs_table.sql b/ansible/roles/monitoring/files/create_logs_table.sql new file mode 100644 index 00000000..fe6a4cfa --- /dev/null +++ b/ansible/roles/monitoring/files/create_logs_table.sql @@ -0,0 +1,56 @@ +CREATE TABLE IF NOT EXISTS default.logs +( + `CODE_FILE` String, + `CODE_FUNC` String, + `CODE_LINE` String, + `INVOCATION_ID` String, + `LOGGER` LowCardinality(String), + `MESSAGE_ID` String, + `MESSAGE` String, + `PRIORITY` UInt8, + `PROCESS_NAME` String, + `SYSLOG_FACILITY` LowCardinality(String), + `SYSLOG_IDENTIFIER` LowCardinality(String), + `SYSLOG_PID` Nullable(UInt64), + `SYSLOG_TIMESTAMP` String, + `THREAD_NAME` String, + `TID` UInt64, + `UNIT` String, + `_AUDIT_LOGINUID` Nullable(UInt64), + `_AUDIT_SESSION` Nullable(UInt64), + `_BOOT_ID` String, + `_CAP_EFFECTIVE` String, + `_CMDLINE` String, + `_COMM` LowCardinality(String), + `_EXE` LowCardinality(String), + `_GID` LowCardinality(UInt32), + `_HOSTNAME` String, + `_KERNEL_DEVICE` String, + `_KERNEL_SUBSYSTEM` String, + `_MACHINE_ID` String, + `_PID` UInt32, + `_SELINUX_CONTEXT` String, + `_SOURCE_MONOTONIC_TIMESTAMP` Nullable(Int64), + `_SOURCE_REALTIME_TIMESTAMP` Int64, + `_STREAM_ID` String, + `_SYSTEMD_CGROUP` LowCardinality(String), + `_SYSTEMD_INVOCATION_ID` String, + `_SYSTEMD_SLICE` String, + `_SYSTEMD_UNIT` LowCardinality(String), + `_TRANSPORT` LowCardinality(String), + `_UDEV_SYSNAME` String, + `_UID` LowCardinality(UInt32), + `__CURSOR` String, + `__MONOTONIC_TIMESTAMP` Nullable(Int64), + `__REALTIME_TIMESTAMP` Int64, + `date` DateTime64(6) ALIAS fromUnixTimestamp64Micro(_SOURCE_REALTIME_TIMESTAMP), + `host` LowCardinality(String), + `inserted_at` DateTime DEFAULT now(), + `message` String, + `rtdate` DateTime64(6) ALIAS fromUnixTimestamp64Micro(__REALTIME_TIMESTAMP), + 
`timestamp` String, + INDEX timestamp_minmax_idx timestamp TYPE minmax GRANULARITY 1 +) +ENGINE = MergeTree +ORDER BY __REALTIME_TIMESTAMP +SETTINGS index_granularity = 8192 diff --git a/ansible/roles/monitoring/files/log-ingestion.service b/ansible/roles/monitoring/files/log-ingestion.service new file mode 100644 index 00000000..ac1e9483 --- /dev/null +++ b/ansible/roles/monitoring/files/log-ingestion.service @@ -0,0 +1,17 @@ +[Unit] +Description=log ingestion + +[Service] +ExecStart=/bin/sh -c 'journalctl -ojson -f | clickhouse-client --query="INSERT INTO logs FORMAT JSONEachRow" --input_format_skip_unknown_fields=1 --input_format_allow_errors_ratio=1' + +SystemCallFilter=~@clock @debug @cpu-emulation @keyring @module @mount @obsolete @raw-io @reboot @swap +NoNewPrivileges=yes +PrivateDevices=yes +PrivateTmp=yes +ProtectHome=yes +ProtectSystem=full +ProtectKernelModules=yes +ProtectKernelTunables=yes + +[Install] +WantedBy=multi-user.target diff --git a/ansible/roles/monitoring/tasks/log-ingestion.yml b/ansible/roles/monitoring/tasks/log-ingestion.yml new file mode 100644 index 00000000..645f086f --- /dev/null +++ b/ansible/roles/monitoring/tasks/log-ingestion.yml @@ -0,0 +1,85 @@ +# # Vector + +- name: vector - enable repo + tags: vector + shell: extrepo enable vector && extrepo update vector + +- name: vector - install pkg + tags: vector + apt: + # refresh cache + cache_valid_time: 0 + name: + - vector + +- name: vector - deploy SQL file to create logs table + tags: vector + copy: + src: create_logs_table.sql + dest: /etc/clickhouse-server/create_logs_table.sql + +- name: vector - create vector_logs table + tags: vector + command: clickhouse-client --multiline --multiquery --queries-file /etc/clickhouse-server/create_logs_table.sql + +- name: vector - Generate syslog certificates + tags: vector + # runs locally + delegate_to: 127.0.0.1 + shell: | + ./vault view files/pusher_ca.key.vault | openssl req -x509 -new -nodes -key /dev/stdin -sha256 -days 3650 -subj '/O=OONI/OU=CA/CN=ooni.org' -out oonicacert.pem + openssl req -newkey rsa:2048 -nodes -days 3650 -keyout node.key -out node-req.pem -subj '/CN=ooni.org/O=OONI temp CA/C=US' -batch + ./vault view files/pusher_ca.key.vault | openssl x509 -req -days 3650 -set_serial 01 -in node-req.pem -out node-cert.pem -CA oonicacert.pem -CAkey /dev/stdin + register: certs_ready + +- name: vector - Copy TLS certs + tags: vector + ansible.builtin.copy: + src: "{{ item }}" + dest: /etc/vector/ + mode: '0440' + owner: vector + loop: + - oonicacert.pem + - node-cert.pem + - node.key + when: certs_ready.changed + +- name: vector - Delete files + tags: vector + # runs locally + delegate_to: 127.0.0.1 + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - node-cert.pem + - node-req.pem + - node.key + - oonicacert.pem + +- name: vector - configure + tags: vector + template: + src: templates/vector.toml + dest: /etc/vector/vector.toml + +- name: vector - open port + tags: vector + ansible.builtin.copy: + src: templates/10514.nft + dest: /etc/ooni/nftables/tcp/ + register: nft_reload_needed + +- name: vector - reload nft + tags: vector + shell: systemctl reload nftables.service + when: nft_reload_needed.changed + +- name: vector - restart service + tags: vector + systemd: + daemon_reload: yes + enabled: yes + name: vector.service + state: restarted diff --git a/ansible/roles/monitoring/tasks/main.yml b/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 00000000..24a45cf1 --- /dev/null +++ 
b/ansible/roles/monitoring/tasks/main.yml
@@ -0,0 +1,304 @@
+---
+# # monitoring host # #
+
+- name: Set grafana apt repo
+  tags: monitoring, grafana
+  template:
+    src: templates/grafana.list
+    dest: /etc/apt/sources.list.d/grafana.list
+    mode: 0644
+    owner: root
+
+- name: Installs packages
+  tags: monitoring, prometheus
+  apt:
+    install_recommends: no
+    cache_valid_time: 86400
+    name:
+      - apt-transport-https
+      - nginx
+      - prometheus
+      - prometheus-blackbox-exporter
+      - extrepo
+
+- name: Give cap_net_raw to prometheus-blackbox-exporter
+  tags: monitoring, prometheus, exporter_cap_net_raw
+  community.general.capabilities:
+    path: /usr/bin/prometheus-blackbox-exporter
+    capability: cap_net_raw+ep
+    state: present
+
+- name: Create Prometheus environment override
+  # Disable strict cert check https://pkg.go.dev/crypto/x509#Certificate.VerifyHostname
+  tags: monitoring, prometheus, override_cert_check
+  template:
+    src: templates/etc_default_prometheus
+    dest: /etc/default/prometheus
+    mode: 0644
+    owner: root
+
+- name: Create Grafana repo GPG pubkey
+  tags: apt
+  template:
+    src: templates/grafana.gpg
+    dest: /etc/apt/grafana.asc
+    mode: 0644
+    owner: root
+
+- name: Create Grafana sources list
+  tags: apt
+  template:
+    src: templates/grafana.sources
+    dest: /etc/apt/sources.list.d/grafana.sources
+    mode: 0644
+    owner: root
+
+- name: Installs grafana
+  tags: monitoring, grafana
+  apt:
+    install_recommends: no
+    cache_valid_time: 86400
+    name:
+      - grafana
+
+- name: Configure grafana
+  tags: monitoring, grafana
+  lineinfile:
+    path: /etc/grafana/grafana.ini
+    regexp: '^;?domain = '
+    line: domain = grafana.ooni.org
+
+- name: Autoremove
+  tags: monitoring
+  apt:
+    autoremove: yes
+
+- name: Clean cache
+  tags: monitoring
+  apt:
+    autoclean: yes
+
+- name: allow HTTPS
+  tags: monitoring
+  blockinfile:
+    path: /etc/ooni/nftables/tcp/443.nft
+    create: yes
+    block: |
+      add rule inet filter input tcp dport 443 counter accept comment "HTTPS"
+
+- name: reload nft
+  tags: monitoring
+  shell: systemctl reload nftables.service
+
+- name: enable grafana
+  tags: monitoring
+  shell: systemctl enable grafana-server
+
+- name: start grafana
+  tags: monitoring
+  shell: systemctl start grafana-server
+
+- name: Deploy nginx conf
+  tags: monitoring, grafana
+  template:
+    src: templates/nginx.conf
+    dest: /etc/nginx/sites-enabled/10-monitoring
+    mode: 0644
+    owner: root
+
+# TODO(art): this htpasswd file contains the password of federico which cannot
+# be removed, since removing it leads to a bunch of alerts being triggered. We
+# should figure out where his personal password was used and replace it with a
+# role based password that is shared and stored in our team keychain.
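+# As a sketch, assuming the htpasswd utility from apache2-utils (the
+# scripts/ngx-mkpasswd helper mentioned in the template header may be the
+# canonical tool), a shared role-based entry could be produced with:
+#   htpasswd -n monitoring
+# and the resulting "monitoring:..." line stored in the team keychain and
+# fed to the monitoring_htpasswd variable.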
+- name: copy monitoring.htpasswd + tags: monitoring, grafana, htpasswd + template: + src: templates/htpasswd + dest: /etc/nginx/monitoring.htpasswd + mode: 0440 + owner: www-data + group: www-data + +- name: reload nginx + tags: monitoring, grafana + shell: systemctl reload nginx + +- name: Installs packages + tags: jupyter + apt: + install_recommends: no + cache_valid_time: 86400 + name: + - jupyter-notebook + - jupyter-server + - python3-bottleneck + - python3-matplotlib + - python3-numpy + - python3-pandas + - python3-psycopg2 + - python3-scipy + - python3-seaborn + - python3-tables + - python3-tqdm + - python3-ujson + +- name: Install jupyter.service + tags: jupyter + template: + src: templates/jupyter.service + dest: /etc/systemd/system/jupyter.service + mode: 0755 + owner: root + +- name: create jupyter dir + tags: jupyter + file: + path: /var/lib/jupyter/conf + state: directory + owner: jupyter + group: jupyter + +- name: create jupyter conf + tags: jupyter + blockinfile: + path: /var/lib/jupyter/conf/jupyter_notebook_config.py + create: yes + owner: jupyter + group: jupyter + block: | + c.NotebookApp.allow_remote_access = True + c.NotebookApp.enable_mathjax = False + c.NotebookApp.open_browser = False + c.NotebookApp.password = u'' + c.NotebookApp.quit_button = False + c.NotebookApp.token = '' + +- name: reload systemd + tags: jupyter + shell: systemctl daemon-reload + +- name: Start jupyter + tags: jupyter + systemd: + name: jupyter.service + state: started + enabled: yes + +- name: Configure fail2ban + tags: fail2ban + lineinfile: + path: /etc/fail2ban/jail.conf + regexp: '^backend ' + line: backend = systemd + +- name: Configure fail2ban + tags: fail2ban + blockinfile: + create: yes + path: /etc/fail2ban/jail.d/ooni.conf + block: | + [nginx-http-auth] + enabled = true + filter = nginx-http-auth + port = http,https + journalmatch = _SYSTEMD_UNIT=nginx.service PRIORITY=3 + + [nginx-400] + enabled = true + port = http,https + filter = nginx-400 + maxretry = 2 + findtime = 300 + +- name: Configure fail2ban + tags: fail2ban + blockinfile: + create: yes + path: /etc/fail2ban/filter.d/nginx-400.conf + block: | + [Definition] + failregex = ^ -.*"(GET|POST|HEAD).*HTTP.*" 400 + ignoreregex = + +- name: reload fail2ban + tags: fail2ban + shell: systemctl reload fail2ban.service + +# jupycron + +- name: Install jupycron + tags: jupycron + apt: + cache_valid_time: 86400 + state: present + name: jupycron + +# # clickhouse # # + +- name: install clickhouse requirements + tags: clickhouse + apt: + cache_valid_time: 86400 + state: present + name: + - apt-transport-https + - ca-certificates + - dirmngr + +- name: install clickhouse keys + tags: clickhouse + template: + src: templates/clickhouse.gpg + dest: /etc/apt/clickhouse.asc + mode: 0644 + owner: root + +- name: set clickhouse repos + tags: clickhouse + template: + src: templates/clickhouse.sources + dest: /etc/apt/sources.list.d/clickhouse.sources + mode: 0644 + owner: root + +- name: pin clickhouse release train + tags: clickhouse + blockinfile: + path: /etc/apt/preferences.d/clickhouse-server + create: yes + block: | + Package: clickhouse-server + Pin: version 23.1.3.* + Pin-Priority: 999 + +- name: install clickhouse + tags: clickhouse + apt: + # refresh cache + cache_valid_time: 0 + name: + - clickhouse-server={{ clickhouse_pkg_ver }} + - clickhouse-client={{ clickhouse_pkg_ver }} + - clickhouse-common-static={{ clickhouse_pkg_ver }} + vars: + clickhouse_pkg_ver: 23.9.* + +#- name: install clickhouse conf override +# tags: clickhouse 
+#  template:
+#    src: clickhouse_config.xml
+#    dest: /etc/clickhouse-server/config.d/ooni_conf.xml
+#    owner: clickhouse
+#    group: clickhouse
+#    mode: 0400
+#  notify: restart clickhouse
+
+- name: Run clickhouse
+  tags: clickhouse
+  systemd:
+    name: clickhouse-server.service
+    state: started
+    enabled: yes
+
+- include_tasks: log-ingestion.yml
+  when: enable_log_ingestion
diff --git a/ansible/roles/monitoring/templates/10514.nft b/ansible/roles/monitoring/templates/10514.nft
new file mode 100644
index 00000000..1dd9dadc
--- /dev/null
+++ b/ansible/roles/monitoring/templates/10514.nft
@@ -0,0 +1,2 @@
+# Managed by ansible roles/monitoring/templates/10514.nft
+add rule inet filter input tcp dport 10514 counter accept comment "incoming logs"
diff --git a/ansible/roles/monitoring/templates/clickhouse.gpg b/ansible/roles/monitoring/templates/clickhouse.gpg
new file mode 100644
index 00000000..ae3a0dcf
--- /dev/null
+++ b/ansible/roles/monitoring/templates/clickhouse.gpg
@@ -0,0 +1,87 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQINBGGbfsABEAC+01IxYaykEHIBW4dom/YipkmjPGbkU/oDibqsHE8VgDLC10Xt
+Glto0H9q1fsTIi1nZ9S4lWUuuEAFgcPBt82qnT3E11ZCDC5khjFvPb7d0iK5Wg5T
+a1hHNLxuqvRn4sOCyMp7LZJkgLFavvGnXWsHOmB3TzQRaQouc+Y21hqANXFCZXCD
+be3Wbdy86ZHOezoWEdHaU1868ZlCPMq10lpBFcLzvdeEfMDAYhOE43c5X0tSbxNo
+//kingPLPQ7nxbgsGDKNIsghazmKyMaJA+qpzMmu8qo1ioPZfW8GcYC06cBBAklQ
+2dQmGqNvpdJeItBeis9kfJvfkWYCbQ5+ebmm/uJDGCXosIf3/OliMykJhiJzILp5
+1AJqCBD6muL5aP7UorGtVrCQeW2DKB6vtw73qoWmIAmMdW80Nr2lkFu8sDM8VeCI
+YUULVa2mkuAet8B3TEtFs6dRZZZNlw6fNny6gmN1dbh+aDebilkHn7eOm8A3qUO3
+PgYNP8swIuGBDQ37lEKZaN6glL7+h0TY9Pz0UsBlCOOh0f3jrjVR2eseQfMtm5TU
+/cqY7/+eTzycRpHRHl+xiSiUT1XwyzCf0sAEuMX6d1Um0KFXWWHYdpe4/P4yQmLw
+Ex3JaYQHM7EBbLsj9vK7J9qAnrtH0kdyz1RBVE1SV0APS7qNec2GBUXh5QARAQAB
+tDpDbGlja0hvdXNlIEluYy4gUmVwb3NpdG9yaWVzIEtleSA8cGFja2FnZXNAY2xp
+Y2tob3VzZS5jb20+iQJOBBMBCAA4FiEEOp6hGTqXtUi+FFfUiRn2vStI11QFAmGb
+fsACGwMFCwkIBwIGFQoJCAsCBBYCAwECHgECF4AACgkQiRn2vStI11R0OA/9GHe5
+Gfco4zWrPSX1cCM1sZZeLZUXeRu4iJlJPahi1sjP2J0rcuv5Xp7paW23el0ksXf4
+P+xknzzp8SlCndQKFWsb404vsNK5VtPB1RJ0Sje0dFM2nO1/Kt8pVX+r1WFJ1gur
+BSugsAs3erTCy/ZIB6nhRcFBGsJfvZV24bVxFQKUW6ARigia3bqOZlM4ksdvjrLL
+HZ84WiLY++W3wTMIh34KzLClVokl4au8L9Z1g+24EqJMRV7722FVm3GZDdVhmzNy
+gKXQvopFvrfQY8Uk9ApZOVSsx9zH65UDP0u13ki7UTlyCi9ucGgT4w6cek1DUcXN
+W0FU8498J80fADcWG4bsPjiJ7WiKI29saVked4gfUqx0G5kUCP+l5oo+okm/BFrt
+66sO0cEbG9KZCmgdTa2KoNUtsampIBXDG0+9c2SWp3pa1j5sErngR2dyV6L8Muxx
+dQJbVR4RxXkM8LkAUMN+YAatknMKEPa7gXalt73fiuvFXpY7NOxeUmlrRXucnCY8
+vSnTNPcuQYAeNyg7z2gyrpqp81Kd+XKUIdKYuydPpvRHzhFOBIf68NUc3MR1UeZx
+eBF+bnVkkyL5nRhU+1eLZ0YyW2/PBCmegUahCHEWLpXGzeNBwdnt3HrJEQg54Fu8
+VPcKFa15kJ8QghTlMwYSQkXUPZeBOYaHtva+aSq5Ag0EYyw+bgEQAOECkzewc6Gd
+lGA65PIqEPL4JORnsYr0fHNHYGJ+WCHC9HuQEcFJ8Q0R7nSNbMmi6KjnTqM4QdkL
+myEQs9TKK5brFBa/ObbL4JVyBJTi4LdlCg95YfEQKTClb7roaLwKH+TykBJ617R+
+QUvFp65YCwzNiS2aKu6AC6zsD2rsrJulq5LIDKvWwLWfFtwRwOHMgviuGlj/97om
+XUhuPIScJ5rB+QzICaiKriF/qVVO7obTV0OcUYrTOkNKo8H/q/fw0FSvX3eznz6f
+K46gi468P34VO5ZLZs4UQih7EzDqdGz4Lo8GnIrtThWH5rLyFhm7PdB6yLwf8bQe
+qDviI73Zb0w9ookYJQpBdglnDyLdhVjoAA0Hp3Q5XL6yvZVZz06YL3k1VWHI2H/A
+WeceTFlrExmF2u1KousrwD8/583T042oRcGW97YV5Q9Uw7iffKvJiLp5VBZa319F
+svF9hxM8bZVgVqBksBI/ScIvzMgG5FHeGflJ2xEIWWv2uDiAs498kmm5wI6Hdcfs
+YuTT+uPsKSNd5G+Ts/39+hirXhVygtx6pFeCRVTdrfWcOfVIVilZm7I+qDNrap2w
+6dT23zJT1DvZFCXymU9dxPEoLOm7cn9oxkvjs6c/YOIeyU5CSdI5ZdO99G/8TbFw
+LKdSTajkRTVfIa99j/8P0jPMetnk7PoHABEBAAGJBGwEGAEIACAWIQQ6nqEZOpe1
+SL4UV9SJGfa9K0jXVAUCYyw+bgIbAgJACRCJGfa9K0jXVMF0IAQZAQgAHRYhBIhe
+K9z5awtFq/BYRT5K1HGd3po4BQJjLD5uAAoJED5K1HGd3po4CjIQALM1EjN2zCtQ
+ip9m5IuPnHA6d/YFIuNAXjAnZcg6HQSi+ERFu1w3mL2ya+ll3EYOhFMVMEnja0h0 +e+ezt+WMJt1R2eH1PMZEeJ2uMXgIOl0QsTUXWemhweTQEmsgqksQMqsvFuogxNP/ +Co+cKJCiBpMIU9FJfLrSoGGidnUqr48QjGRVx69FPPthlNOj6/KkhksmJFTT7YhM +G8V7K8yxiYIcQQvRfINLZfmXF2cvGlZU21Il43nCmNb4iFUnrUOouQ7k/Oe/OzZq +ggmcfPGp+6GoSJB3exr3NonjuEhmYR1VPe+WcgsWSl5RWVEaasN5C7dcekqWsqlA +UHpvIqDfzaL3s4HBp19uxxdiy42wXotDhT02P7DJVjVJf9XWgQbmaanzLdPnrUxw +UYcLEQM17VJ37j9DyW9zPBB/pKLRwDr89xpmAKMtRzeXImnBuUJHctp/c9dFvbhJ +GgxFgLHhBCUdTsEs7phLAB32n+oP2Mas6yRgG98k46ic+ZNdJIzplfXT96n/5u+a +eyQOkY3rZh6Jbhzwpb+r5Tj1rAXIJta6prkd8d3+1Rg37x9HkEAHn3ZPDk1RwbVO +VCU0UjMOPU/6Y3FsQq30jSuNhr70p5H/okwpkS7YX/7lY0068XEpWoIyJQIHcBM0 +HZCV904mwdqfRDujziIoQ/xEm3+zIVtzdgYP/3kX4kUup2DEUEY5ND3oKc5CAjyF +UGU7thzILHtt76WrrwXqqfxjJKl9+acmLlrBLj5FQwFEhThEeWunm2T5hhgr4FfF +4MXneQqVRhUslm5+sRIm/K1ye5q4NLfgiwZD/nTM5defyI4cynrby4fy1R9SCBLo +lzvPLpvEraegxIQ3Hk1kT5HmnXi79hZVbZxIkt6wNPG1rTS0TaJ2M37nmhm4GcNj +DtKnhy4Aq569DO8Sd0U6bYKIaFFoi7X5KYnKWY37pYFl0ERU0ffQp7Mg9ruHRvHh +kA41iQ5kF/O1fJHRUfXcYnfIfvN1C77TdXLQryrA/bRNS2dlulIu4CYH+lsfhlIj +BQrRoEOWGu6PzKBwPIXKcVFfB4BcG5si/3vZD3Q1xfUj3MV2sQcgVNbd/t7jJda3 +uhX9DiEPyqdgl382Rq+LoPV4az1igDLDUe7yPrCpD1rwJMyKJyStiP4i1jiIejVa +vPiCpXH1oxCQeR8KqjdrdNbSVgDOapPauZEcwJL464ZlWkdxKX09sN6GaXFTHYTE +XkxrzhlYZaysWEO86+iBUcHnRtc1D3dMJg7hi/Du1IouTwnBsYIwd9Fz/wHpDkiu +0Z3gCmYEg4JQj4iwAtNOQfhJzhEH4kHLom8j8XI/TlgJe/vtNUUUDp3i8qczTjua +VywRbE3+9xjjsSiIuQINBGGbfsABEADXZ0mAMCsf7Y2yJ2aSBLCVPUEZESeB+kDA +ciksAguHeMVp7owI2mfAfw4Z8TEQE6aOb+cqbDbqneQ6nuaRxMNvlDhhtrKztt+U +xWrvgPMvJ59/O1ujY7+VZ+3lXJ01r0u5Xdk1zUZ1uYYl8+Lt3/WmJmmlosX22EwQ +dsAvuRuOmuM2jPTMVN81TaZ2R0Om5vT2L8/SQY7csxc5OylBLskuDQmU1PNlkwTl +tCMqHl7g08LPADLGi/3C9I/0P92A+jZ9RgDCI8BIOS5Ny7CStT96dqrje6TLbWRx +IWBjn1xsVUTxYQY4FXYfw1p7Rm5kzZGImzI4Z7wNcVHCty+X9La+7yKwABW2BeTq +P0rCXpKmy4nNOgwTDVRaA5CL8mg2doK8HOoa03SgziPGEeHHFyS7I2DJAEw+h9K8 +jXVy9Tgd7ZSXmO4Z99ovdHksX4fmD7iOWU2WRp3b2yhpO8b/TgU2r9AQaNMAOsud +N+0Q9NGkEYhWRdWsGapJpa3Q+v1s88lodPhjcIkqGUCH82T/+MhTDdotssj8a6ue +g3mcwVD38690j8ngoBCPmLCNtgZDqhtfy+905uO3ksonF/X2lFc/YG7sqBiOHoDK +yN347pPyK8KbukiMoOasDIQUsryBUeAM0M0Xzftc66BaPgH75KTPZqkOO8aTBCRx +l/AHTejV5wARAQABiQI2BBgBCAAgFiEEOp6hGTqXtUi+FFfUiRn2vStI11QFAmGb +fsACGwwACgkQiRn2vStI11RimQ//a0dHoSekuMLDPEYpQYEEr5JYYWFdFPA/ixeM +HZ566ankWMqKHynhbqqRXVhqKd0pveV+ALhgrUU1nBB9Ma5P51VNEGC0rThiLBVk +CTrV0UrUXxxgA5cTJBVZe7yyysqdCHcuuNNrlQbuzhkJWBySgeWrbmd4VUbDdL0i +GMFSAtbrYjSWreBDBgWBYd8uRiPHtc6ACBIhzGGOulw0k4NnMXHDJyCwEv7EAxES +Y/V8kLK3p7DXc6tpmd1vZ38X52CnVRM6aLQl6YWKXhA+vp9cPJehfEbK+ZPErbCB +4jF6AC4BFwntE0YSod6TVM+wJBuPplnWr4nUUhsk/FeulWGF+AhUG2XJiIn90ZHI +cxlct8Nt1zBgoquxWYmqt/z+s7TOBXnvotOMJggjgsPozFlHjoy47Xc14VwzxnU+ +NknXYqDAsTfjgXiw7NRQdJ8BUd4TSf8iINf2uCNzp4QDJ37VfN7/BiaXBGjEEN+k +OKpxkse6qm06GFd+bXGSHh6H7z2d973k2QIfW03opWAwS+AdH8xmUUaAyyhEql5m +lXZVkQRS8rEO4IV5HQV8FD5iV9bdNihaydlKP09/D9ZDlie0dJZXOIHQDuwDt4Ab +3reSs1T5utClQE0FS8ZZjPuitq/l+TsT83TASVynvEmwvNiEbpOsNjvB1u++w4ni +qCHQv3Y= +=AKGw +-----END PGP PUBLIC KEY BLOCK----- diff --git a/ansible/roles/monitoring/templates/clickhouse.sources b/ansible/roles/monitoring/templates/clickhouse.sources new file mode 100644 index 00000000..db4eda17 --- /dev/null +++ b/ansible/roles/monitoring/templates/clickhouse.sources @@ -0,0 +1,7 @@ +Architectures: amd64 +Suites: stable +Uris: https://packages.clickhouse.com/deb +Types: deb +Components: main +Enabled: yes +Signed-By: /etc/apt/clickhouse.asc diff --git a/ansible/roles/monitoring/templates/etc_default_prometheus b/ansible/roles/monitoring/templates/etc_default_prometheus new file mode 100644 index 00000000..5ba9e6c2 
--- /dev/null +++ b/ansible/roles/monitoring/templates/etc_default_prometheus @@ -0,0 +1,4 @@ +# Managed by ansible, see +# monitoring/templates/etc_default_prometheus +ARGS="" +GODEBUG=x509ignoreCN=0 diff --git a/ansible/roles/monitoring/templates/grafana.gpg b/ansible/roles/monitoring/templates/grafana.gpg new file mode 100644 index 00000000..35771a23 --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana.gpg @@ -0,0 +1,41 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQGNBGTnhmkBDADUE+SzjRRyitIm1siGxiHlIlnn6KO4C4GfEuV+PNzqxvwYO+1r +mcKlGDU0ugo8ohXruAOC77Kwc4keVGNU89BeHvrYbIftz/yxEneuPsCbGnbDMIyC +k44UOetRtV9/59Gj5YjNqnsZCr+e5D/JfrHUJTTwKLv88A9eHKxskrlZr7Un7j3i +Ef3NChlOh2Zk9Wfk8IhAqMMTferU4iTIhQk+5fanShtXIuzBaxU3lkzFSG7VuAH4 +CBLPWitKRMn5oqXUE0FZbRYL/6Qz0Gt6YCJsZbaQ3Am7FCwWCp9+ZHbR9yU+bkK0 +Dts4PNx4Wr9CktHIvbypT4Lk2oJEPWjcCJQHqpPQZXbnclXRlK5Ea0NVpaQdGK+v +JS4HGxFFjSkvTKAZYgwOk93qlpFeDML3TuSgWxuw4NIDitvewudnaWzfl9tDIoVS +Bb16nwJ8bMDzovC/RBE14rRKYtMLmBsRzGYHWd0NnX+FitAS9uURHuFxghv9GFPh +eTaXvc4glM94HBUAEQEAAbQmR3JhZmFuYSBMYWJzIDxlbmdpbmVlcmluZ0BncmFm +YW5hLmNvbT6JAdQEEwEKAD4WIQS1Oud7rbYwpoMEYAWWP6J3EEWFRQUCZOeGaQIb +AwUJA8JnAAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAKCRCWP6J3EEWFRUiADACa +i+xytv2keEFJWjXNnFAx6/obnHRcXOI3w6nH/zL8gNI7YN5jcdQT2NYvKVYTb3fW +GuMsjHWgat5Gq3AtJrOKABpZ6qeYNPk0Axn/dKtOTwXjZ4pKX3bbUYvVfs0fCEZv +B0HHIj2wI9kgMpoTrkj22LE8layZTPOoQ+3/FbLzS8hN3CYZj25mHN7bpZq8EbV3 +8FW9EU0HM0tg6CvoxkRiVqAuAC0KnVIZAdhD4dlYKuncq64nMvT1A5wxSYbnE+uf +mnWQQhhS6BOwRqN054yw1FrWNDFsvnOSHmr8dIiriv+aZYvx5JQFJ7oZP3LwdYyg +ocQcAJA8HFTIk3P6uJiIF/zdDzocgdKs+IYDoId0hxX7sGCvqdrsveq8n3m7uQiN +7FvSiV0eXIdV4F7340kc8EKiYwpuYSaZX0UWKLenzlUvD+W4pZCWtoXzPsW7PKUt +q1xdW0+NY+AGLCvSJCc5F4S5kFCObfBAYBbldjwwJFocdq/YOvvWYTPyV7kJeJS5 +AY0EZOeGaQEMALNIFUricEIwtZiX7vSDjwxobbqPKqzdek8x3ud0CyYlrbGHy0k+ +FDEXstjJQQ1s9rjJSu3sv5wyg9GDAUH3nzO976n/ZZvKPti3p2XU2UFx5gYkaaFV +D56yYxqGY0YU5ft6BG+RUz3iEPg3UBUzt0sCIYnG9+CsDqGOnRYIIa46fu2/H9Vu +8JvvSq9xbsK9CfoQDkIcoQOixPuI4P7eHtswCeYR/1LUTWEnYQWsBCf57cEpzR6t +7mlQnzQo9z4i/kp4S0ybDB77wnn+isMADOS+/VpXO+M7Zj5tpfJ6PkKch3SGXdUy +3zht8luFOYpJr2lVzp7n3NwB4zW08RptTzTgFAaW/NH2JjYI+rDvQm4jNs08Dtsp +nm4OQvBA9Df/6qwMEOZ9i10ixqk+55UpQFJ3nf4uKlSUM7bKXXVcD/odq804Y/K4 +y3csE059YVIyaPexEvYSYlHE2odJWRg2Q1VehmrOSC8Qps3xpU7dTHXD74ZpaYbr +haViRS5v/lCsiwARAQABiQG8BBgBCgAmFiEEtTrne622MKaDBGAFlj+idxBFhUUF +AmTnhmkCGwwFCQPCZwAACgkQlj+idxBFhUUNbQv8DCcfi3GbWfvp9pfY0EJuoFJX +LNgci7z7smXq7aqDp2huYQ+MulnPAydjRCVW2fkHItF2Ks6l+2/8t5Xz0eesGxST +xTyR31ARENMXaq78Lq+itZ+usOSDNuwJcEmJM6CceNMLs4uFkX2GRYhchkry7P0C +lkLxUTiB43ooi+CqILtlNxH7kM1O4Ncs6UGZMXf2IiG9s3JDCsYVPkC5QDMOPkTy +2ZriF56uPerlJveF0dC61RZ6RlM3iSJ9Fwvea0Oy4rwkCcs5SHuwoDTFyxiyz0QC +9iqi3fG3iSbLvY9UtJ6X+BtDqdXLAT9Pq527mukPP3LwpEqFVyNQKnGLdLOu2YXc +TWWWseSQkHRzBmjD18KTD74mg4aXxEabyT4snrXpi5+UGLT4KXGV5syQO6Lc0OGw +9O/0qAIU+YW7ojbKv8fr+NB31TGhGYWASjYlN1NvPotRAK6339O0/Rqr9xGgy3AY +SR+ic2Y610IM7xccKuTVAW9UofKQwJZChqae9VVZ +=J9CI +-----END PGP PUBLIC KEY BLOCK----- diff --git a/ansible/roles/monitoring/templates/grafana.list b/ansible/roles/monitoring/templates/grafana.list new file mode 100644 index 00000000..adbad20b --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana.list @@ -0,0 +1 @@ +deb https://packages.grafana.com/oss/deb stable main diff --git a/ansible/roles/monitoring/templates/grafana.sources b/ansible/roles/monitoring/templates/grafana.sources new file mode 100644 index 00000000..dd17e11d --- /dev/null +++ b/ansible/roles/monitoring/templates/grafana.sources @@ -0,0 +1,7 @@ +Architectures: amd64 +Suites: stable +Uris: https://apt.grafana.com 
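+# Signed-By below must keep pointing at the pubkey that the "Create Grafana
+# repo GPG pubkey" task installs as /etc/apt/grafana.asc.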
+Types: deb
+Components: main
+Enabled: yes
+Signed-By: /etc/apt/grafana.asc
diff --git a/ansible/roles/monitoring/templates/htpasswd b/ansible/roles/monitoring/templates/htpasswd
new file mode 100644
index 00000000..3c18c804
--- /dev/null
+++ b/ansible/roles/monitoring/templates/htpasswd
@@ -0,0 +1,5 @@
+# ansible-managed in ooni/devops.git
+# Username should be taken from @openobservatory.org domain
+# Password should be generated with scripts/ngx-mkpasswd
+# don't remove federico, because it will break monitoring
+{{ monitoring_htpasswd }}
diff --git a/ansible/roles/monitoring/templates/jupyter.service b/ansible/roles/monitoring/templates/jupyter.service
new file mode 100644
index 00000000..d8c1dc97
--- /dev/null
+++ b/ansible/roles/monitoring/templates/jupyter.service
@@ -0,0 +1,36 @@
+[Unit]
+Description=Jupyter
+After=network.target
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/jupyter-notebook --no-browser
+Environment=JUPYTER_CONFIG_DIR=/var/lib/jupyter/conf
+Environment=JUPYTER_DATA_DIR=/var/lib/jupyter/data
+Environment=JUPYTER_RUNTIME_DIR=/var/lib/jupyter/run
+Restart=on-failure
+# DynamicUser=yes implies ProtectSystem=strict ProtectHome=read-only PrivateTmp, RemoveIPC, NoNewPrivileges, RestrictSUIDSGID
+DynamicUser=yes
+PrivateDevices=yes
+PrivateMounts=yes
+PrivateUsers=yes
+DevicePolicy=closed
+ProtectHostname=yes
+ProtectHome=yes
+ProtectControlGroups=yes
+ProtectKernelModules=yes
+ProtectKernelTunables=yes
+RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6 AF_NETLINK
+RestrictNamespaces=yes
+RestrictRealtime=yes
+MemoryDenyWriteExecute=yes
+LockPersonality=yes
+
+CacheDirectory=jupyter
+ReadWriteDirectories=/var/lib/jupyter
+RuntimeDirectory=jupyter
+StateDirectory=jupyter
+WorkingDirectory=/var/lib/jupyter
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/monitoring/templates/nginx.conf b/ansible/roles/monitoring/templates/nginx.conf
new file mode 100644
index 00000000..c383297d
--- /dev/null
+++ b/ansible/roles/monitoring/templates/nginx.conf
@@ -0,0 +1,203 @@
+# Managed by ansible
+# roles/monitoring/templates/nginx.conf
+
+# Grafana
+map $http_upgrade $connection_upgrade {
+    default upgrade;
+    ''      close;
+}
+server {
+    listen 443 ssl http2;
+    listen [::]:443 ssl http2;
+    server_name grafana.ooni.org;
+    access_log syslog:server=unix:/dev/log,severity=info;
+    error_log syslog:server=unix:/dev/log,severity=info;
+    gzip on;
+
+    ssl_certificate /var/lib/dehydrated/certs/grafana.ooni.org/fullchain.pem;
+    ssl_certificate_key /var/lib/dehydrated/certs/grafana.ooni.org/privkey.pem;
+    ssl_trusted_certificate /var/lib/dehydrated/certs/grafana.ooni.org/chain.pem;
+
+    # Use the intermediate configuration to support legacy probes
+    # https://ssl-config.mozilla.org/#server=nginx&version=1.14.2&config=intermediate&openssl=1.1.1d&guideline=5.6
+    ssl_session_timeout 5m;
+    ssl_session_cache shared:MozSSL:30m;
+    ssl_session_tickets off;
+
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
+    ssl_prefer_server_ciphers off;
+    add_header Strict-Transport-Security "max-age=63072000" always;
+    ssl_stapling on;
+    ssl_stapling_verify on;
+
+    resolver 127.0.0.1;
+
+    # Grafana uses its own authentication
+
+    location / {
+        proxy_pass http://localhost:3000;
+        # do not forward the basic auth header to grafana
+        proxy_set_header
Authorization ""; + proxy_set_header Host $http_host; + } + + # Proxy Grafana Live WebSocket connections. + location /api/live { + rewrite ^/(.*) /$1 break; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + proxy_set_header Host $http_host; + proxy_pass http://localhost:3000/; + } +} + +# Netdata +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name netdata.ooni.org; + access_log syslog:server=unix:/dev/log,severity=info; + error_log syslog:server=unix:/dev/log,severity=info; + gzip on; + + ssl_certificate /var/lib/dehydrated/certs/netdata.ooni.org/fullchain.pem; + ssl_certificate_key /var/lib/dehydrated/certs/netdata.ooni.org/privkey.pem; + ssl_trusted_certificate /var/lib/dehydrated/certs/netdata.ooni.org/chain.pem; + + # Use the intermediate configuration to support legacy probes + # https://ssl-config.mozilla.org/#server=nginx&version=1.14.2&config=intermediate&openssl=1.1.1d&guideline=5.6 + ssl_session_timeout 5m; + ssl_session_cache shared:MozSSL:30m; + ssl_session_tickets off; + + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + + # HSTS (ngx_http_headers_module is required) (63072000 seconds) + add_header Strict-Transport-Security "max-age=63072000" always; + + # OCSP stapling + ssl_stapling on; + ssl_stapling_verify on; + + # verify chain of trust of OCSP response using Root CA and Intermediate certs + #ssl_trusted_certificate /path/to/root_CA_cert_plus_intermediates; + + resolver 127.0.0.1; + + location /{ + proxy_set_header Host $host; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Forwarded-Server $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_http_version 1.1; + proxy_pass_request_headers on; + proxy_set_header Connection "keep-alive"; + proxy_store off; + proxy_pass http://127.0.0.1:19999/; + auth_basic "OONI Monitoring"; + auth_basic_user_file /etc/nginx/monitoring.htpasswd; + gzip on; + gzip_proxied any; + gzip_types *; + } +} + +# Prometheus +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name prometheus.ooni.org; + access_log syslog:server=unix:/dev/log,severity=info; + error_log syslog:server=unix:/dev/log,severity=info; + gzip on; + + ssl_certificate /var/lib/dehydrated/certs/prometheus.ooni.org/fullchain.pem; + ssl_certificate_key /var/lib/dehydrated/certs/prometheus.ooni.org/privkey.pem; + ssl_trusted_certificate /var/lib/dehydrated/certs/prometheus.ooni.org/chain.pem; + + # Use the intermediate configuration to support legacy probes + # https://ssl-config.mozilla.org/#server=nginx&version=1.14.2&config=intermediate&openssl=1.1.1d&guideline=5.6 + ssl_session_timeout 5m; + ssl_session_cache shared:MozSSL:30m; + ssl_session_tickets off; + + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + + # HSTS (ngx_http_headers_module is required) (63072000 seconds) + add_header Strict-Transport-Security "max-age=63072000" always; + + # OCSP stapling + ssl_stapling on; + ssl_stapling_verify on; + + # verify chain of trust 
of OCSP response using Root CA and Intermediate certs + #ssl_trusted_certificate /path/to/root_CA_cert_plus_intermediates; + + resolver 127.0.0.1; + location / { + proxy_pass http://127.0.0.1:9090; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + auth_basic "OONI Monitoring"; + auth_basic_user_file /etc/nginx/monitoring.htpasswd; + } +} + +# Jupyter +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name jupyter.ooni.org; + access_log syslog:server=unix:/dev/log,severity=info; + error_log syslog:server=unix:/dev/log,severity=info; + gzip on; + + ssl_certificate /var/lib/dehydrated/certs/jupyter.ooni.org/fullchain.pem; + ssl_certificate_key /var/lib/dehydrated/certs/jupyter.ooni.org/privkey.pem; + ssl_trusted_certificate /var/lib/dehydrated/certs/jupyter.ooni.org/chain.pem; + + # Use the intermediate configuration to support legacy probes + # https://ssl-config.mozilla.org/#server=nginx&version=1.14.2&config=intermediate&openssl=1.1.1d&guideline=5.6 + ssl_session_timeout 5m; + ssl_session_cache shared:MozSSL:30m; + ssl_session_tickets off; + + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + + # HSTS (ngx_http_headers_module is required) (63072000 seconds) + add_header Strict-Transport-Security "max-age=63072000" always; + + # OCSP stapling + ssl_stapling on; + ssl_stapling_verify on; + + # verify chain of trust of OCSP response using Root CA and Intermediate certs + #ssl_trusted_certificate /path/to/root_CA_cert_plus_intermediates; + + resolver 127.0.0.1; + + location / { + proxy_pass http://127.0.0.1:8888; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Host $http_host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + auth_basic "OONI Monitoring"; + auth_basic_user_file /etc/nginx/monitoring.htpasswd; + + # websocket headers + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + proxy_set_header X-Scheme $scheme; + + proxy_buffering off; + } +} diff --git a/ansible/roles/monitoring/templates/vector.list b/ansible/roles/monitoring/templates/vector.list new file mode 100644 index 00000000..1b56727a --- /dev/null +++ b/ansible/roles/monitoring/templates/vector.list @@ -0,0 +1,2 @@ +# See ansible/roles/monitoring/tasks/main.yml +deb https://repositories.timber.io/public/vector/deb/debian bullseye main diff --git a/ansible/roles/monitoring/templates/vector.toml b/ansible/roles/monitoring/templates/vector.toml new file mode 100644 index 00000000..1b236589 --- /dev/null +++ b/ansible/roles/monitoring/templates/vector.toml @@ -0,0 +1,24 @@ +# Managed by ansible + +[sources.local_journald] +type = "journald" +exclude_units = [ "clickhouse" ] + +[sources.remote_vector] +type = "vector" +address = "0.0.0.0:10514" + +tls.enabled = true +tls.verify_certificate = true +tls.verify_hostname = false +tls.ca_file = "/etc/vector/oonicacert.pem" +tls.crt_file = "/etc/vector/node-cert.pem" +tls.key_file = "/etc/vector/node.key" + +[sinks.clickhouse_sink] +type = "clickhouse" +inputs = [ "local_journald", "remote_vector" ] +database = "default" +endpoint = "http://localhost:8123" +table = "logs" +date_time_best_effort = true diff --git a/ansible/roles/nftables/defaults/main.yml 
b/ansible/roles/nftables/defaults/main.yml new file mode 100644 index 00000000..1f9e51f1 --- /dev/null +++ b/ansible/roles/nftables/defaults/main.yml @@ -0,0 +1,10 @@ +nft_rules_tcp: + - name: 22 + rules: + - add rule inet filter input tcp dport 22 counter accept comment "Incoming SSH" + #- name: 80 + # rules: + # - add rule inet filter input tcp dport 80 counter accept comment "incoming HTTP" + #- name: 443 + # rules: + # - add rule inet filter input tcp dport 443 counter accept comment "incoming HTTPS" diff --git a/ansible/roles/nftables/tasks/main.yml b/ansible/roles/nftables/tasks/main.yml index 2789b150..5946772b 100644 --- a/ansible/roles/nftables/tasks/main.yml +++ b/ansible/roles/nftables/tasks/main.yml @@ -16,12 +16,15 @@ tags: - nftables -- name: allow SSH - ansible.builtin.blockinfile: - path: /etc/ooni/nftables/tcp/22.nft - create: yes - block: | - add rule inet filter input tcp dport 22 counter accept comment "Incoming SSH" +- name: "write nft config for item" + ansible.builtin.template: + src: "rule.nft.j2" + dest: "/etc/ooni/nftables/tcp/{{ item.name }}.nft" + vars: + rules: "{{ item.rules }}" + loop: "{{ nft_rules_tcp }}" + notify: + - Reload nftables tags: - nftables diff --git a/ansible/roles/nftables/templates/nftables.conf b/ansible/roles/nftables/templates/nftables.conf index 5f7b50cc..0d94f9ca 100755 --- a/ansible/roles/nftables/templates/nftables.conf +++ b/ansible/roles/nftables/templates/nftables.conf @@ -38,4 +38,3 @@ include "/etc/ooni/nftables/tcp/*.nft" # Configure any other rule include "/etc/ooni/nftables/*.nft" - diff --git a/ansible/roles/nftables/templates/rule.nft.j2 b/ansible/roles/nftables/templates/rule.nft.j2 new file mode 100644 index 00000000..093c8baa --- /dev/null +++ b/ansible/roles/nftables/templates/rule.nft.j2 @@ -0,0 +1,4 @@ +{{ ansible_managed | comment }} +{% for entry in rules %} +{{ entry }} +{% endfor %} diff --git a/ansible/roles/nginx/defaults/main.yml b/ansible/roles/nginx/defaults/main.yml new file mode 100644 index 00000000..4c0ac11a --- /dev/null +++ b/ansible/roles/nginx/defaults/main.yml @@ -0,0 +1 @@ +nginx_user: nginx diff --git a/ansible/roles/nginx/tasks/main.yml b/ansible/roles/nginx/tasks/main.yml index b93304c1..9af2a9b4 100644 --- a/ansible/roles/nginx/tasks/main.yml +++ b/ansible/roles/nginx/tasks/main.yml @@ -1,4 +1,18 @@ --- +- ansible.builtin.include_role: + name: nftables + vars: + nft_rules_tcp: + - name: 80 + rules: + - add rule inet filter input tcp dport 80 counter accept comment "incoming HTTP" + - name: 443 + rules: + - add rule inet filter input tcp dport 443 counter accept comment "incoming HTTPS" + tags: + - nginx + - nftables + - name: install nginx include_role: name: nginxinc.nginx diff --git a/ansible/roles/nginx/templates/nginx.conf b/ansible/roles/nginx/templates/nginx.conf index f43bf7c5..7b1b594c 100644 --- a/ansible/roles/nginx/templates/nginx.conf +++ b/ansible/roles/nginx/templates/nginx.conf @@ -1,122 +1,61 @@ -# NB: system nginx uses `www-data` user! 
-user nginx;
-worker_processes 2;
+# Managed by ansible
+# roles/nginx/templates/nginx.conf
+#
-error_log /var/log/nginx/error.log warn;
-pid /var/run/nginx.pid;
+user {{ nginx_user }};
+worker_processes auto;
+pid /run/nginx.pid;
+include /etc/nginx/modules-enabled/*.conf;
 events {
-    worker_connections 1024;
+    worker_connections 768;
+    # multi_accept on;
 }
 http {
-    include /etc/nginx/mime.types;
-    default_type application/octet-stream;
-
-    geo $is_ooni {
-        # TODO: this is not implemented ATM
-        default 0;
-    }
-
-    map $http_x_request_id $has_request_id { # check for `X-Request-ID`
-        "" 0;
-        default 1;
-    }
-
-    map "$is_ooni:$has_request_id" $ooni_request_id {
-        "1:1" $http_x_request_id; # use `X-Request-ID` if it's okay
-        default $request_id;
-    }
-
-    # IPv4 is anonymized to /24, IPv6 to /48 - according to OONI Data Policy.
-    # https://ooni.torproject.org/about/data-policy/
-    # IP is recorded to track possible abusers, not to distinguish users, so the
-    # address is truncated down to ISP (min routable prefix) instead of hashing.
-    map $remote_addr $ooni_remote_addr {
-        default "0.0.0.0";
-        # variables in map value require nginx/1.11.0+
-        "~(?P<ip>\d+\.\d+\.\d+)\.\d+" "$ip.0";
-        # :: means at least TWO zero 16bit fields, https://tools.ietf.org/html/rfc5952#section-4.2.2
-        "~(?P<ip>[0-9a-f]+:[0-9a-f]+:[0-9a-f]+):[0-9a-f:]+" "$ip::";
-        "~(?P<ip>[0-9a-f]+:[0-9a-f]+)::[0-9a-f:]+" "$ip::";
-        "~(?P<ip>[0-9a-f]+)::[0-9a-f:]+" "$ip::";
-    }
-
-    # $server_name is important as mtail does not distinguish log lines from
-    # different files, $host is required to log actual `Host` header.
-    # $request is split into separate fields to ease awk and mtail parsing.
-    # $scheme is used instead of $https to ease eye-reading.
-    # TCP_INFO is logged for random fun.
-    log_format mtail_pub
-        '$time_iso8601\t$msec\t$server_name\t'
-        '$ooni_remote_addr\t' # pub/int diff
-        '$request_completion\t$request_time\t$status\t$bytes_sent\t$body_bytes_sent\t'
-        '$upstream_cache_status\t$upstream_addr\t$upstream_status\t$upstream_connect_time\t$upstream_header_time\t$upstream_response_time\t'
-        '$scheme\t$server_protocol\t$request_length\t$request_method\t$host\t$request_uri\t'
-        '$tcpinfo_rtt\t$tcpinfo_rttvar\t'
-        '$http_referer\t$http_user_agent\t$ooni_request_id';
-
-    log_format mtail_int
-        '$time_iso8601\t$msec\t$server_name\t'
-        '$remote_addr\t' # pub/int diff
-        '$request_completion\t$request_time\t$status\t$bytes_sent\t$body_bytes_sent\t'
-        '$upstream_cache_status\t$upstream_addr\t$upstream_status\t$upstream_connect_time\t$upstream_header_time\t$upstream_response_time\t'
-        '$scheme\t$server_protocol\t$request_length\t$request_method\t$host\t$request_uri\t'
-        '$tcpinfo_rtt\t$tcpinfo_rttvar\t'
-        '$http_referer\t$http_user_agent\t$ooni_request_id';
-
-    log_format oolog '$ooni_remote_addr - $remote_user [$time_local] '
-        '"$request" $status $body_bytes_sent '
-        '"$http_referer" "$http_user_agent" "$host"';
-
-    log_format oolog_mtail '$time_iso8601\t$msec\t$server_name\t'
-        '$ooni_remote_addr\t' # pub/int diff
-        '$request_completion\t$request_time\t$status\t$bytes_sent\t$body_bytes_sent\t'
-        '$upstream_cache_status\t$upstream_addr\t$upstream_status\t$upstream_connect_time\t$upstream_header_time\t$upstream_response_time\t'
-        '$scheme\t$server_protocol\t$request_length\t$request_method\t$host\t$request_uri\t'
-        '$tcpinfo_rtt\t$tcpinfo_rttvar\t'
-        '$http_referer\t$http_user_agent\t$ooni_request_id';
-
-    access_log /var/log/nginx/access.log mtail_int;
-
-    sendfile on;
-    tcp_nopush on; # TCP_CORK HTTP headers with sendfile() body into single packet
-
-    keepalive_timeout 120 120; # Firefox has 115s, http://kb.mozillazine.org/Network.http.keep-alive.timeout
-
-    server_tokens off;
-
-    # SSL based on https://wiki.mozilla.org/Security/Server_Side_TLS (doc v4.1)
-    ssl_session_timeout 1d;
-    ssl_session_cache shared:GLOBAL:1m; # 1m of cache is ~4000 sessions
-    ssl_session_tickets off; # needs accurate key rotation
-    ssl_dhparam /etc/nginx/ffdhe2048_dhparam.pem; # https://tools.ietf.org/html/rfc7919
-    ssl_prefer_server_ciphers on;
-    #TODO: ssl_stapling on; # needs `resolver` or `ssl_stapling_file`
-    #TODO: ssl_stapling_verify on; # needs `ssl_trusted_certificate`
-    #TODO: resolver ;
-    # Define in server{}
-    # - include /etc/nginx/ssl_modern.conf | /etc/nginx/ssl_intermediate.conf
-    # - ssl_certificate /etc/letsencrypt/live/example.org/fullchain.pem;
-    # - ssl_certificate_key /etc/letsencrypt/live/example.org/privkey.pem
-    # - ssl_trusted_certificate /etc/letsencrypt/live/example.org/chain.pem; # for ssl_stapling_verify
-    # - add_header Strict-Transport-Security max-age=15768000; # HSTS (15768000 seconds = 6 months)
-    ###
-
-    gzip on;
-    gzip_types text/html text/plain text/css text/xml text/javascript application/x-javascript application/json application/xml; # default is only `text/html`
-    gzip_disable "msie6";
-    #gzip_proxied any;
-
-    # Host, X-Real-IP, X-Forwarded-For, X-Forwarded-Proto are from
-    # file /etc/nginx/proxy_params from nginx-common package
-    # NB: adding `proxy_set_header` in another location overwrites whole set!
-    proxy_set_header Host $http_host;
-    proxy_set_header X-Real-IP $remote_addr;
-    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-    proxy_set_header X-Forwarded-Proto $scheme;
-    proxy_set_header X-Request-ID $ooni_request_id;
-
-    include /etc/nginx/conf.d/*.conf;
-    include /etc/nginx/sites-enabled/*;
+
+    # Basic Settings
+
+    sendfile on;
+    tcp_nopush on; # TCP_CORK HTTP headers with sendfile() body into single packet
+    types_hash_max_size 2048;
+    # server_tokens off;
+
+    # server_names_hash_bucket_size 64;
+    # server_name_in_redirect off;
+
+    include /etc/nginx/mime.types;
+    default_type application/octet-stream;
+
+    # Logging Settings
+
+    # anonymize ipaddr
+    map $remote_addr $remote_addr_anon {
+        ~(?P<ip>\d+\.\d+\.\d+)\.    $ip.0;
+        ~(?P<ip>[^:]+:[^:]+):       $ip::;
+        default                     0.0.0.0;
+    }
+
+    # log anonymized ipaddr and caching status
+    log_format ooni_nginx_fmt '$remote_addr_anon $upstream_cache_status [$time_local] '
+        '"$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"';
+
+    access_log syslog:server=unix:/dev/log ooni_nginx_fmt;
+    error_log syslog:server=unix:/dev/log;
+
+    # Gzip Settings
+
+    gzip on;
+
+    # gzip_vary on;
+    # gzip_proxied any;
+    # gzip_comp_level 6;
+    # gzip_buffers 16 8k;
+    # gzip_http_version 1.1;
+    # gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript;
+
+    # Virtual Host Configs
+
+    include /etc/nginx/conf.d/*.conf;
+    include /etc/nginx/sites-enabled/*;
 }
diff --git a/ansible/roles/notify-slack/tasks/main.yml b/ansible/roles/notify-slack/tasks/main.yml
new file mode 100644
index 00000000..83290d00
--- /dev/null
+++ b/ansible/roles/notify-slack/tasks/main.yml
@@ -0,0 +1,8 @@
+---
+- name: "notify #ooni-bots at openobservatory.slack.com"
+  slack:
+    token: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/ansible_slack_token', profile='oonidevops_user_prod') }}"
+    channel: "#ooni-bots"
+    msg: "{{ lookup('template', 'notify-slack.j2') }}"
+  delegate_to: localhost
+  run_once: true
diff --git a/ansible/roles/notify-slack/templates/notify-slack.j2 b/ansible/roles/notify-slack/templates/notify-slack.j2
new file mode 100644
index 00000000..6f43d7dc
--- /dev/null
+++ b/ansible/roles/notify-slack/templates/notify-slack.j2
@@ -0,0 +1,10 @@
+{% set cleanness = 'dirty' if lookup('pipe', 'git status -s') else 'clean' %}
+{% set head = lookup('pipe', 'git rev-parse HEAD') %}
+{% set head7 = lookup('pipe', 'git rev-parse --short HEAD') %}
+{% set branch = lookup('pipe', 'git describe --always --exact-match --all HEAD') | replace('heads/', '') %}
+{% set user_slug = lookup('pipe', 'whoami') + '@' + lookup('pipe', 'hostname') %}
+{% if lookup('file', '~/.ooni-sysadmin/user_slug', errors='ignore') %}
+{% set user_slug = lookup('file', '~/.ooni-sysadmin/user_slug') %}
+{% endif %}
+
+{{ user_slug }} runs `{{ lookup('env', 'ANSIBLE_SLACK_CMD') }}` on top of {{ cleanness }} `{{ head7 }}` (`{{ branch }}`)
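+{# user_slug can be overridden per operator; a sketch (the slug value is an example):
+     mkdir -p ~/.ooni-sysadmin
+     echo "jdoe@laptop" > ~/.ooni-sysadmin/user_slug
+#}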
diff --git a/ansible/roles/ooni-backend/handlers/main.yml b/ansible/roles/ooni-backend/handlers/main.yml
new file mode 100644
index 00000000..84d0f4f1
--- /dev/null
+++ b/ansible/roles/ooni-backend/handlers/main.yml
@@ -0,0 +1,6 @@
+---
+- name: reload nftables
+  service: name=nftables state=reloaded
+
+- name: restart clickhouse
+  service: name=clickhouse-server state=restarted
diff --git a/ansible/roles/ooni-backend/meta/main.yml b/ansible/roles/ooni-backend/meta/main.yml
new file mode 100644
index 00000000..c82f9e2d
--- /dev/null
+++ b/ansible/roles/ooni-backend/meta/main.yml
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: nftables
diff --git a/ansible/roles/ooni-backend/tasks/main.yml b/ansible/roles/ooni-backend/tasks/main.yml
new file mode 100644
index 00000000..a6ee12d6
--- /dev/null
+++ b/ansible/roles/ooni-backend/tasks/main.yml
@@ -0,0 +1,697 @@
+---
+
+## API ##
+
+- name: install API if not present
+  # do not update package if present
+  tags: api
+  apt:
+    cache_valid_time: '{{ apt_cache_valid_time }}'
+    name: ooni-api
+    state: present
+    update_cache: yes
+
+- name: create Nginx cache dir
+  file:
+    path: /var/cache/nginx/ooni-api
+    state: directory
+
+- name: configure test api
+  when: inventory_hostname == 'backend-hel.ooni.org'
+  tags: api
+  template:
+    src: api.conf
+    dest: /etc/ooni/api.conf
+    owner: ooniapi
+    group: ooniapi
+    mode: 0640
+  vars:
+    collectors: []
+    # bucket_name and collector_id must match the uploader
+    collector_id: 2
+    bucket_name: ooni-data-eu-fra-test
+    github_push_repo: "ooni-bot/test-lists"
+    github_origin_repo: "ooni/test-lists"
+    login_base_url: "https://test-lists.test.ooni.org/login"
+    pg_uri: ""
+    clickhouse_url: clickhouse://api:api@localhost/default
+    # mail_smtp_password: "DISABLED"
+    # jwt_encryption_key and account_id_hashing_key are taken from the vault
+
+- name: configure backend-fsn api
+  when: inventory_hostname == 'backend-fsn.ooni.org'
+  tags: api
+  template:
+    src: api.conf
+    dest: /etc/ooni/api.conf
+    owner: ooniapi
+    group: ooniapi
+    mode: 0640
+  vars:
+    collectors: ['backend-fsn.ooni.org']
+    # bucket_name and collector_id must match the uploader
+    collector_id: 1
+    bucket_name: ooni-data-eu-fra
+    github_push_repo: "ooni/test-lists"
+    github_origin_repo: "citizenlab/test-lists"
+    login_base_url: "https://test-lists.ooni.org/login"
+    pg_uri: ""
+    clickhouse_url: clickhouse://api:api@localhost/default
+    base_url: "https://api.ooni.io"
+
+- name: create Psiphon conffile
+  tags: api
+  copy:
+    content: "{{ psiphon_config }}"
+    dest: /etc/ooni/psiphon_config.json
+
+- name: Write Tor targets conffile
+  tags: api
+  template:
+    src: tor_targets.json
+    dest: /etc/ooni/tor_targets.json
+
+- name: configure api uploader using test bucket
+  when: inventory_hostname == 'backend-hel.ooni.org'
+  tags: api
+  template:
+    src: templates/api-uploader.conf
+    dest: /etc/ooni/api-uploader.conf
+  vars:
+    # bucket_name and collector_id must match the API
+    bucket_name: ooni-data-eu-fra-test
+    collector_id: 2
+
+- name: configure FSN api uploader using PROD bucket
+  when: inventory_hostname == 'backend-fsn.ooni.org'
+  tags: api
+  template:
+    src: templates/api-uploader.conf
+    dest: /etc/ooni/api-uploader.conf
+  vars:
+    # bucket_name and collector_id must match the API
+    bucket_name: ooni-data-eu-fra
+    collector_id: 1
+
+## Haproxy and nginx ##
+
+- name: Overwrite API nginx test conf
+  when: inventory_hostname == 'backend-hel.ooni.org'
+  tags: api, webserv
+  template:
+    src: templates/nginx-api-test.conf
+    dest: /etc/nginx/sites-available/ooni-api.conf
+    mode: 0755
+    owner: root
+  vars:
+    # Uses dehydrated
+    certpath: /var/lib/dehydrated/certs/
+
+- name: install haproxy if not present
+  when: inventory_hostname in ('backend-hel.ooni.org')
+  tags: webserv
+  apt:
+    cache_valid_time: 86400
+    name: haproxy
+    state: present
+
+- name: Deploy haproxy conf
+  when: inventory_hostname in ('backend-hel.ooni.org')
+  tags: api, webserv
+  template:
+    src: templates/haproxy.cfg
+    dest: /etc/haproxy/haproxy.cfg
+    mode: 0755
+    owner: root
+  vars:
+    # Uses dehydrated
+    certpath: /var/lib/dehydrated/certs/
+
+- name: Delete old files
+  when: inventory_hostname in ('backend-hel.ooni.org')
+  tags: api, webserv
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: absent
+  loop:
+    - /etc/nginx/sites-enabled/00-letsencrypt-http
+    - /etc/nginx/sites-enabled/deb_ooni_org
+    - /etc/nginx/sites-enabled/deb_ooni_org_http
+
+- name: Deploy dehydrated conf
+  when: inventory_hostname in ('backend-hel.ooni.org')
+  tags: api, webserv
+  template:
+    src: templates/dehydrated.config
+    dest: /etc/dehydrated/config
+    mode: 0755
+    owner: root
+
+- name: Deploy dehydrated haproxy hook
+  when:
inventory_hostname in ('backend-hel.ooni.org') + tags: api, webserv + template: + src: templates/dehydrated_haproxy_hook.sh + dest: /etc/dehydrated/haproxy_hook.sh + mode: 0755 + owner: root + +- name: Overwrite API nginx FSN conf + when: inventory_hostname == 'backend-fsn.ooni.org' + tags: api, webserv + template: + src: templates/nginx-api-fsn.conf + dest: /etc/nginx/sites-available/ooni-api.conf + mode: 0755 + owner: root + vars: + # Uses dehydrated + certpath: /var/lib/dehydrated/certs/ + +- name: Deploy API gunicorn conf + tags: api + template: + src: api.gunicorn.py + dest: /etc/ooni/api.gunicorn.py + owner: ooniapi + group: ooniapi + mode: 0640 + +- name: Create symlink for API nginx conf + tags: api + file: + src=/etc/nginx/sites-available/ooni-api.conf + dest=/etc/nginx/sites-enabled/ooni-api.conf + state=link + +- name: Configure deb.ooni.org forwarder on FSN host + when: inventory_hostname in ('backend-fsn.ooni.org', ) + tags: deb_ooni_org + # Uses dehydrated + template: + src: deb_ooni_org.nginx.conf + dest: /etc/nginx/sites-enabled/deb_ooni_org + +- name: Configure deb-ci.ooni.org forwarder on test host + when: inventory_hostname == 'backend-hel.ooni.org' + tags: deb_ooni_org + blockinfile: + path: /etc/nginx/sites-enabled/deb_ooni_org_http + create: yes + block: | + # Managed by ansible, see roles/ooni-backend/tasks/main.yml + server { + listen 80; + server_name deb-ci.ooni.org; + location / { + proxy_pass https://ooni-internal-deb.s3.eu-central-1.amazonaws.com/; + } + } + +- name: create badges dir + tags: api + file: + path: /var/www/package_badges/ + state: directory + +- name: Safely reload Nginx + # TODO remove restart after transition to haproxy + tags: api, deb_ooni_org, webserv + shell: nginx -t && systemctl reload nginx + +- name: Restart Nginx + tags: webserv + shell: nginx -t && systemctl restart nginx + +- name: Restart haproxy + # reload is not enough + when: inventory_hostname in ('backend-hel.ooni.org') + tags: api, deb_ooni_org, webserv + shell: systemctl restart haproxy + +- name: allow incoming TCP connections to API + tags: api + blockinfile: + path: /etc/ooni/nftables/tcp/443.nft + create: yes + block: | + add rule inet filter input tcp dport 443 counter accept comment "incoming HTTPS" + +- name: allow incoming TCP connections to haproxy metrics + tags: webserv + template: + src: 444.nft + dest: /etc/ooni/nftables/tcp/444.nft + +#- name: reload nftables service +# tags: api +# systemd: +# name: nftables.service +# state: reloaded + +- name: reload nftables service + tags: api, webserv + shell: systemctl reload nftables.service + + +## Fastpath ## + +- name: install fastpath if not present + # do not update package if present + when: inventory_hostname != 'backend-fsn.ooni.org' + tags: fastpath + apt: + cache_valid_time: 86400 + name: fastpath + state: present + +- name: configure fastpath on test + when: inventory_hostname == 'backend-hel.ooni.org' + tags: fastpath + template: + src: fastpath.conf + dest: /etc/ooni/fastpath.conf + owner: fastpath + group: fastpath + mode: 0640 + vars: + clickhouse_url: clickhouse://fastpath:fastpath@localhost/default + +- name: configure fastpath on FSN + when: inventory_hostname == 'backend-fsn.ooni.org' + tags: fastpath + template: + src: fastpath.conf + dest: /etc/ooni/fastpath.conf + owner: fastpath + group: fastpath + mode: 0640 + vars: + clickhouse_url: clickhouse://fastpath:fastpath@localhost/default + + + +## Event detector ## + +#- name: install detector +# tags: detector +# apt: +# cache_valid_time: 86400 +# 
name: detector +# +#- name: configure detector +# tags: detector +# blockinfile: +# path: /etc/ooni/detector.conf +# create: yes +# block: | +# # Managed by ansible, see roles/ooni-backend/tasks/main.yml + + +## Analysis daemon ## + +- name: install analysis + # do not update package if present + when: inventory_hostname != 'backend-fsn.ooni.org' + tags: analysis + apt: + cache_valid_time: 86400 + name: analysis=1.4~pr408-209 + force: True + state: present + +- name: configure analysis + tags: analysis-conf + template: + src: analysis.conf + dest: /etc/ooni/analysis.conf + # Managed by ansible, see roles/ooni-backend/tasks/main.yml + + +## Test helper rotation ## + +- name: configure test helper rotation + tags: rotation + when: inventory_hostname == 'backend-fsn.ooni.org' + blockinfile: + path: /etc/ooni/rotation.conf + create: yes + mode: 0400 + block: | + # Managed by ansible, see roles/ooni-backend/tasks/main.yml + [DEFAULT] + # Digital Ocean token + token = {{ digital_ocean_token }} + active_droplets_count = 4 + size_slug = s-1vcpu-1gb + image_name = debian-11-x64 + draining_time_minutes = 1440 + dns_zone = th.ooni.org + +- name: configure test helper rotation certbot + tags: rotation + when: inventory_hostname == 'backend-fsn.ooni.org' + blockinfile: + path: /etc/ooni/certbot-digitalocean + create: yes + mode: 0400 + block: | + # Managed by ansible, see roles/ooni-backend/tasks/main.yml + dns_digitalocean_token = {{ digital_ocean_token }} + +- name: configure test helper rotation setup script + tags: rotation + when: inventory_hostname == 'backend-fsn.ooni.org' + template: + src: rotation_setup.sh + dest: /etc/ooni/rotation_setup.sh + +- name: create test helper rotation nginx template + tags: rotation + when: inventory_hostname == 'backend-fsn.ooni.org' + template: + src: rotation_nginx_conf + dest: /etc/ooni/rotation_nginx_conf + +- name: generate test helper rotation SSH keypair + tags: rotation + when: inventory_hostname == 'backend-fsn.ooni.org' + openssh_keypair: + path: /etc/ooni/testhelper_ssh_key + owner: root + group: root + mode: 0400 + type: ed25519 + register: pubkey + +- name: print SSH pubkey + tags: rotation + when: inventory_hostname == 'backend-fsn.ooni.org' + debug: msg={{ pubkey.public_key }} + +- name: Enable and start rotation service + tags: rotation + when: inventory_hostname == 'backend-fsn.ooni.org' + systemd: + daemon_reload: yes + enabled: yes + name: ooni-rotation.timer + state: started + + +## Tor daemon and onion service ## + +## TODO(decfox): get rid of this? 
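+# NOTE: the 16-character guegdifjy7bjpequ.onion below is a legacy v2 onion
+# address; v2 onion services were removed in Tor 0.4.6, so current tor
+# releases will refuse to load it. A sketch of checking the state on the host
+# before dropping these tasks:
+#   sudo -u debian-tor tor --verify-config
+#   sudo ls -l /var/lib/tor/ooni_onion_service/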
+- name: configure tor onion service hostname + when: inventory_hostname == 'ams-pg.ooni.org' + tags: tor + blockinfile: + path: /var/lib/tor/ooni_onion_service/hostname + create: yes + owner: debian-tor + group: debian-tor + mode: 0644 + block: guegdifjy7bjpequ.onion + +- name: configure tor onion service private_key + when: inventory_hostname == 'ams-pg.ooni.org' + tags: tor + blockinfile: + path: /var/lib/tor/ooni_onion_service/private_key + create: yes + owner: debian-tor + group: debian-tor + mode: 0600 + block: "{{ amspg_ooni_org_onion_key }}" + +- name: set tor onion service directory + when: inventory_hostname == 'ams-pg.ooni.org' + tags: tor + shell: | + chown debian-tor:debian-tor /var/lib/tor/ooni_onion_service + chmod 0700 /var/lib/tor/ooni_onion_service + + +# # Clickhouse # # + +- name: install APT HTTPS support + # do not update package if present + when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org') + tags: clickhouse + apt: + cache_valid_time: 86400 + state: present + name: + - apt-transport-https + - ca-certificates + - dirmngr + +- name: install clickhouse keys + when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org') + tags: clickhouse + command: apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754 + +- name: set clickhouse repos + when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org') + tags: clickhouse + blockinfile: + path: /etc/apt/sources.list.d/clickhouse.list + create: yes + block: | + deb https://packages.clickhouse.com/deb lts main + +- name: pin clickhouse release train + when: inventory_hostname in ('backend-fsn.ooni.org', ) + tags: clickhouse + blockinfile: + path: /etc/apt/preferences.d/clickhouse-server + create: yes + block: | + Package: clickhouse-server + Pin: version 21.8.12.* + Pin-Priority: 999 + +- name: pin clickhouse release train + when: inventory_hostname in ('backend-hel.ooni.org') + tags: clickhouse + blockinfile: + path: /etc/apt/preferences.d/clickhouse-server + create: yes + block: | + Package: clickhouse-server + Pin: version 23.8.2.* + Pin-Priority: 999 + +- name: install clickhouse on backend-fsn + when: inventory_hostname == 'backend-fsn.ooni.org' + tags: clickhouse + apt: + # refresh cache + cache_valid_time: 0 + name: + - clickhouse-server={{ clickhouse_pkg_ver }} + - clickhouse-client={{ clickhouse_pkg_ver }} + - clickhouse-common-static={{ clickhouse_pkg_ver }} + vars: + clickhouse_pkg_ver: 21.8.12.* + +- name: install clickhouse on backend-hel.ooni.org + when: inventory_hostname == 'backend-hel.ooni.org' + tags: clickhouse + apt: + # refresh cache + cache_valid_time: 0 + name: + - clickhouse-server={{ clickhouse_pkg_ver }} + - clickhouse-client={{ clickhouse_pkg_ver }} + - clickhouse-common-static={{ clickhouse_pkg_ver }} + vars: + clickhouse_pkg_ver: 23.8.2.* + +- name: install clickhouse conf override + when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org') + tags: clickhouse + template: + src: clickhouse_config.xml + dest: /etc/clickhouse-server/config.d/ooni_conf.xml + owner: clickhouse + group: clickhouse + mode: 0400 + notify: restart clickhouse + +- name: allow incoming TCP connections from monitoring to Clickhouse prometheus interface + when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org') + tags: clickhouse + blockinfile: + path: /etc/ooni/nftables/tcp/9363.nft + create: yes + block: | + add rule inet filter input ip saddr 5.9.112.244 tcp dport 9363 counter accept comment "clickhouse 
prometheus from monitoring.ooni.org"
+  notify: reload nftables
+
+- name: allow incoming TCP connections from jupyter on monitoring.ooni.org to Clickhouse
+  when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org')
+  tags: clickhouse
+  blockinfile:
+    path: /etc/ooni/nftables/tcp/9000.nft
+    create: yes
+    block: |
+      add rule inet filter input ip saddr 5.9.112.244 tcp dport 9000 counter accept comment "clickhouse from monitoring.ooni.org"
+  notify: reload nftables
+
+- name: Run clickhouse
+  when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org')
+  tags: clickhouse
+  systemd:
+    name: clickhouse-server.service
+    state: started
+    enabled: yes
+
+## Clickhouse access control ##
+# https://clickhouse.com/docs/en/operations/access-rights/#enabling-access-control
+
+- name: Clickhouse - test admin user - failure is ok to ignore
+  when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org')
+  tags: clickhouse-users
+  command: clickhouse-client -u admin --password admin -q 'select 1'
+  ignore_errors: true
+  register: admin_check
+
+- name: install python3-lxml
+  when: admin_check is defined and admin_check is failed
+  tags: clickhouse-users
+  apt:
+    cache_valid_time: 86400
+    name: python3-lxml
+
+- name: Clickhouse - set flag
+  when: admin_check is defined and admin_check is failed
+  tags: clickhouse-users
+  # The users.xml file itself needs to be edited for this to work
+  xml:
+    path: /etc/clickhouse-server/users.xml
+    backup: yes
+    xpath: /clickhouse/users/default/{{ item }}
+    value: "1"
+  loop:
+    - access_management
+    - named_collection_control
+    - show_named_collections
+    - show_named_collections_secrets
+  register: users_xml
+
+- name: Clickhouse - restart immediately if needed
+  when: admin_check is defined and admin_check is failed
+  tags: clickhouse-users
+  systemd:
+    name: clickhouse-server
+    state: restarted
+
+- name: Clickhouse - create admin
+  when: admin_check is defined and admin_check is failed
+  tags: clickhouse-users
+  command: clickhouse-client -q "CREATE USER OR REPLACE admin IDENTIFIED WITH sha256_password BY 'admin' HOST LOCAL GRANTEES ANY"
+  # The server might be still starting: retry as needed
+  retries: 10
+  delay: 5
+  register: result
+  until: result.rc == 0
+
+- name: Clickhouse - grant admin rights
+  when: admin_check is defined and admin_check is failed
+  tags: clickhouse-users
+  command: clickhouse-client -q 'GRANT ALL ON *.* TO admin WITH GRANT OPTION'
+
+- name: Clickhouse - create readonly profile
+  when: admin_check is defined and admin_check is failed
+  tags: clickhouse-users
+  template:
+    src: clickhouse_readonly.xml
+    dest: /etc/clickhouse-server/users.d/make_default_readonly.xml
+    owner: clickhouse
+    group: clickhouse
+    mode: 0640
+
+  #- name: Clickhouse - restore users.xml
+  #  when: admin_check is defined and admin_check is failed
+  #  tags: clickhouse-users
+  #  command: mv {{ users_xml.backup_file }} /etc/clickhouse-server/users.xml
+
+- name: Clickhouse - restart immediately if needed
+  when: admin_check is defined and admin_check is failed
+  tags: clickhouse-users
+  systemd:
+    name: clickhouse-server
+    state: restarted
+
+- name: Clickhouse - setup users and permissions
+  tags: clickhouse-users
+  command: clickhouse-client -u admin --password admin -q "{{ item }}"
+  loop:
+    - "CREATE USER OR REPLACE api IDENTIFIED WITH sha256_password BY 'api' HOST LOCAL"
+    - "GRANT ALL ON *.* TO api"
+    - "CREATE USER OR REPLACE fastpath IDENTIFIED WITH sha256_password BY 'fastpath' HOST LOCAL"
+    - "GRANT ALL ON *.*
TO fastpath" + +## end of Clickhouse access control ## + + + +- name: Run feeder on backend-hel + when: inventory_hostname == 'backend-hel.ooni.org' + tags: clickhouse + blockinfile: + path: /etc/ooni/clickhouse_feeder.conf + create: yes + block: | + [DEFAULT] + pg_dbuser = readonly + pg_dbhost = localhost + +- name: run feeder on backend-fsn + when: inventory_hostname == 'backend-fsn.ooni.org' + tags: clickhouse + blockinfile: + path: /etc/ooni/clickhouse_feeder.conf + create: yes + block: | + [DEFAULT] + pg_dbuser = readonly + pg_dbhost = backend-hel.ooni.org + +- name: Run feeder + when: inventory_hostname in ('backend-fsn.ooni.org', 'backend-hel.ooni.org') + tags: clickhouse + systemd: + name: ooni-clickhouse-feeder.service + state: started + enabled: yes + +- name: Run DB backup on backend-hel + when: inventory_hostname == 'backend-hel.ooni.org' + tags: dbbackup + template: + src: db-backup.conf + dest: /etc/ooni/db-backup.conf + mode: 0600 + vars: + public_bucket_name: ooni-data-eu-fra-test + +- name: Run DB backup on FSN + when: inventory_hostname == 'backend-fsn.ooni.org' + tags: dbbackup + template: + src: db-backup.conf + dest: /etc/ooni/db-backup.conf + mode: 0600 + vars: + public_bucket_name: ooni-data-eu-fra diff --git a/ansible/roles/ooni-backend/templates/444.nft b/ansible/roles/ooni-backend/templates/444.nft new file mode 100644 index 00000000..03f5106f --- /dev/null +++ b/ansible/roles/ooni-backend/templates/444.nft @@ -0,0 +1,2 @@ +# roles/ooni-backend/templates/444.nft +add rule inet filter input tcp dport 444 counter accept comment "incoming haproxy metrics" diff --git a/ansible/roles/ooni-backend/templates/analysis.conf b/ansible/roles/ooni-backend/templates/analysis.conf new file mode 100644 index 00000000..4df8a8ae --- /dev/null +++ b/ansible/roles/ooni-backend/templates/analysis.conf @@ -0,0 +1,9 @@ +# Managed by ansible, see roles/ooni-backend/tasks/main.yml +# [s3bucket] +# bucket_name = ooni-data-eu-fra-test +# aws_access_key_id = +# aws_secret_access_key = + +[backup] +# space separated +table_names = citizenlab fastpath jsonl diff --git a/ansible/roles/ooni-backend/templates/api-uploader.conf b/ansible/roles/ooni-backend/templates/api-uploader.conf new file mode 100644 index 00000000..2de0e399 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/api-uploader.conf @@ -0,0 +1,9 @@ +# OONI API measurement uploader - Python ini format +# Deployed by ansible, see roles/ooni-backend/templates/api-uploader.conf +[DEFAULT] +# arn:aws:iam::676739448697:user/ooni-pipeline, AWS: OONI Open Data +aws_access_key_id = AKIAJURD7T4DTN5JMJ5Q +aws_secret_access_key = {{ s3_ooni_open_data_access_key }} +bucket_name = {{ bucket_name }} +msmt_spool_dir = /var/lib/ooniapi/measurements +collector_id = {{ collector_id }} diff --git a/ansible/roles/ooni-backend/templates/api.conf b/ansible/roles/ooni-backend/templates/api.conf new file mode 100644 index 00000000..25d1d0c6 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/api.conf @@ -0,0 +1,60 @@ +# Deployed by ansible +# See ooni-backend/tasks/main.yml ooni-backend/templates/api.conf +# Syntax: treat it as a Python file, but only uppercase variables are used +COLLECTORS = {{ collectors }} +COLLECTOR_ID = {{ collector_id }} + +# Read-only database access +# The password is already made public +DATABASE_URI_RO = "{{ pg_uri }}" + +DATABASE_STATEMENT_TIMEOUT = 20 + +{% if clickhouse_url|length %} +USE_CLICKHOUSE = True +{% else %} +USE_CLICKHOUSE = False +{% endif %} + +CLICKHOUSE_URL = "{{ clickhouse_url }}" + + +BASE_URL = "{{ 
base_url }}"
+
+AUTOCLAVED_BASE_URL = "http://datacollector.infra.ooni.io/ooni-public/autoclaved/"
+CENTRIFUGATION_BASE_URL = "http://datacollector.infra.ooni.io/ooni-public/centrifugation/"
+
+S3_ACCESS_KEY_ID = "AKIAJURD7T4DTN5JMJ5Q"
+S3_BUCKET_NAME = "{{ bucket_name }}"
+S3_SECRET_ACCESS_KEY = "CHANGEME"
+S3_SESSION_TOKEN = "CHANGEME"
+S3_ENDPOINT_URL = "CHANGEME"
+
+PSIPHON_CONFFILE = "/etc/ooni/psiphon_config.json"
+TOR_TARGETS_CONFFILE = "/etc/ooni/tor_targets.json"
+
+JWT_ENCRYPTION_KEY = "{{ jwt_encryption_key }}"
+ACCOUNT_ID_HASHING_KEY = "{{ account_id_hashing_key }}"
+
+SESSION_EXPIRY_DAYS = 180
+LOGIN_EXPIRY_DAYS = 365
+
+# Registration email delivery
+MAIL_SERVER = "mail.riseup.net"
+MAIL_PORT = 465
+MAIL_USE_SSL = True
+MAIL_USERNAME = "ooni-mailer"
+MAIL_PASSWORD = "{{ mail_smtp_password }}"
+MAIL_SOURCE_ADDRESS = "contact@ooni.org"
+LOGIN_BASE_URL = "{{ login_base_url }}"
+
+GITHUB_WORKDIR = "/var/lib/ooniapi/citizenlab"
+GITHUB_TOKEN = "{{ github_token }}"
+GITHUB_USER = "ooni-bot"
+GITHUB_ORIGIN_REPO = "{{ github_origin_repo }}"
+GITHUB_PUSH_REPO = "{{ github_push_repo }}"
+
+# Measurement spool directory
+MSMT_SPOOL_DIR = "/var/lib/ooniapi/measurements"
+GEOIP_ASN_DB = "/var/lib/ooniapi/asn.mmdb"
+GEOIP_CC_DB = "/var/lib/ooniapi/cc.mmdb"
diff --git a/ansible/roles/ooni-backend/templates/api.gunicorn.py b/ansible/roles/ooni-backend/templates/api.gunicorn.py
new file mode 100644
index 00000000..f86b6f67
--- /dev/null
+++ b/ansible/roles/ooni-backend/templates/api.gunicorn.py
@@ -0,0 +1,12 @@
+# Gunicorn configuration file
+# Managed by ansible, see roles/ooni-backend/tasks/main.yml
+# and templates/api.gunicorn.py
+
+workers = 12
+
+loglevel = "info"
+proc_name = "ooni-api"
+reuse_port = True
+# Disabled statsd: https://github.com/benoitc/gunicorn/issues/2843
+#statsd_host = "127.0.0.1:8125"
+#statsd_prefix = "ooni-api"
diff --git a/ansible/roles/ooni-backend/templates/clickhouse_config.xml b/ansible/roles/ooni-backend/templates/clickhouse_config.xml
new file mode 100644
index 00000000..548c2a81
--- /dev/null
+++ b/ansible/roles/ooni-backend/templates/clickhouse_config.xml
@@ -0,0 +1,41 @@
+<!-- NOTE: the XML markup of this template was lost in extraction; the
+     element names below are a best-effort reconstruction around the
+     surviving values and Jinja logic. The root element name matches the
+     /clickhouse/... xpath used by the tasks above; the prometheus and
+     listen_host elements are standard ClickHouse settings. -->
+<clickhouse>
+    <logger>
+        <level>information</level>
+    </logger>
+{% if inventory_hostname == 'backend-fsn.ooni.org' %}
+    <display_name>production</display_name>
+    <!-- the element name for the value 20100100100 was lost;
+         max_server_memory_usage is an assumption -->
+    <max_server_memory_usage>20100100100</max_server_memory_usage>
+{% else %}
+    <display_name>{{ inventory_hostname.replace(".ooni.org", "") }}</display_name>
+{% endif %}
+{% if inventory_hostname == 'backend-hel.ooni.org' %}
+    <!-- the element names for the values 500100100 and 3100100100 were lost
+         in extraction and are not reconstructed here -->
+{% endif %}
+
+    <listen_host>0.0.0.0</listen_host>
+
+    <prometheus>
+        <endpoint>/metrics</endpoint>
+        <port>9363</port>
+        <metrics>true</metrics>
+        <events>true</events>
+        <asynchronous_metrics>true</asynchronous_metrics>
+        <status_info>true</status_info>
+    </prometheus>
+</clickhouse>
diff --git a/ansible/roles/ooni-backend/templates/clickhouse_readonly.xml b/ansible/roles/ooni-backend/templates/clickhouse_readonly.xml
new file mode 100644
index 00000000..73645616
--- /dev/null
+++ b/ansible/roles/ooni-backend/templates/clickhouse_readonly.xml
@@ -0,0 +1,9 @@
+<!-- Reconstructed: the markup was lost in extraction. Judging by the file
+     name and surviving value, it assigns the readonly profile to the
+     default user (assumed structure). -->
+<clickhouse>
+    <users>
+        <default>
+            <profile>readonly</profile>
+        </default>
+    </users>
+</clickhouse>
diff --git a/ansible/roles/ooni-backend/templates/clickhouse_users.xml b/ansible/roles/ooni-backend/templates/clickhouse_users.xml
new file mode 100644
index 00000000..49fd011a
--- /dev/null
+++ b/ansible/roles/ooni-backend/templates/clickhouse_users.xml
@@ -0,0 +1,31 @@
+<!-- Reconstructed: the markup was lost in extraction. The surviving values
+     suggest a readonly profile, a default user allowed from anywhere with
+     that profile, and a local writer user authenticated by password;
+     element and user names are assumptions. -->
+<clickhouse>
+    <profiles>
+        <readonly>
+            <readonly>1</readonly>
+        </readonly>
+    </profiles>
+    <users>
+        <default>
+            <profile>readonly</profile>
+            <networks>
+                <ip>0.0.0.0</ip>
+            </networks>
+        </default>
+        <writer>
+            <password_sha256_hex>{{ clickhouse_writer_password|hash('sha256') }}</password_sha256_hex>
+            <networks>
+                <ip>127.0.0.1</ip>
+            </networks>
+        </writer>
+    </users>
+</clickhouse>
diff --git a/ansible/roles/ooni-backend/templates/db-backup.conf b/ansible/roles/ooni-backend/templates/db-backup.conf
new file mode 100644
index 00000000..4302f0ec
--- /dev/null
+++ b/ansible/roles/ooni-backend/templates/db-backup.conf
@@ -0,0 +1,17 @@
+{
+  "ver": 0,
+  "action": "export",
+  "public_aws_access_key_id": "AKIAJURD7T4DTN5JMJ5Q",
"AKIAJURD7T4DTN5JMJ5Q", + "public_aws_secret_access_key": "{{ s3_ooni_open_data_access_key }}", + "public_bucket_name": "{{ public_bucket_name }}", + "clickhouse_url": "clickhouse://localhost/default", + "__description": "tables can be backed up as: ignore, full, incremental, partition", + "backup_tables": { + "citizenlab": "ignore", + "fastpath": "ignore", + "jsonl": "ignore", + "msmt_feedback": "ignore", + "test_helper_instances": "ignore", + "url_priorities": "ignore" + } +} diff --git a/ansible/roles/ooni-backend/templates/deb_ooni_org.nginx.conf b/ansible/roles/ooni-backend/templates/deb_ooni_org.nginx.conf new file mode 100644 index 00000000..c069fd55 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/deb_ooni_org.nginx.conf @@ -0,0 +1,64 @@ +# Managed by ansible, see roles/ooni-backend/tasks/main.yml + +# anonymize ipaddr +map $remote_addr $remote_addr_anon { + ~(?P\d+\.\d+\.\d+)\. $ip.0; + ~(?P[^:]+:[^:]+): $ip::; + default 0.0.0.0; +} + +# log anonymized ipaddr +log_format deb_ooni_org_logfmt '$remote_addr_anon [$time_local] ' + '"$request" $status snt:$body_bytes_sent rt:$request_time uprt:$upstream_response_time "$http_referer" "$http_user_agent"'; + +server { + listen 80; + server_name deb.ooni.org; + access_log syslog:server=unix:/dev/log,severity=info deb_ooni_org_logfmt; + error_log syslog:server=unix:/dev/log,severity=info; + gzip on; + resolver 127.0.0.1; + # Serve ACME challenge from disk + location ^~ /.well-known/acme-challenge { + alias /var/lib/dehydrated/acme-challenges; + } + location / { + proxy_pass https://ooni-deb.s3.eu-central-1.amazonaws.com/; + } +} + +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name deb.ooni.org; + access_log syslog:server=unix:/dev/log,severity=info deb_ooni_org_logfmt; + error_log syslog:server=unix:/dev/log,severity=info; + gzip on; + ssl_certificate /var/lib/dehydrated/certs/{{ inventory_hostname }}/fullchain.pem; + ssl_certificate_key /var/lib/dehydrated/certs/{{ inventory_hostname }}/privkey.pem; + ssl_trusted_certificate /var/lib/dehydrated/certs/{{ inventory_hostname }}/chain.pem; # for ssl_stapling_verify + + ssl_session_timeout 5m; + ssl_session_cache shared:MozSSL:30m; + ssl_session_tickets off; + + ssl_protocols TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + + # OCSP stapling + ssl_stapling on; + ssl_stapling_verify on; + + # verify chain of trust of OCSP response using Root CA and Intermediate certs + #ssl_trusted_certificate /path/to/root_CA_cert_plus_intermediates; + + resolver 127.0.0.1; + location / { + proxy_pass https://ooni-deb.s3.eu-central-1.amazonaws.com/; + } +} diff --git a/ansible/roles/ooni-backend/templates/dehydrated.config b/ansible/roles/ooni-backend/templates/dehydrated.config new file mode 100644 index 00000000..7a0293a2 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/dehydrated.config @@ -0,0 +1,7 @@ +# Deployed by ansible +# See roles/ooni-backend/templates/dehydrated.config +CONFIG_D=/etc/dehydrated/conf.d +BASEDIR=/var/lib/dehydrated +WELLKNOWN="${BASEDIR}/acme-challenges" +DOMAINS_TXT="/etc/dehydrated/domains.txt" 
+HOOK="/etc/dehydrated/haproxy_hook.sh" diff --git a/ansible/roles/ooni-backend/templates/dehydrated_haproxy_hook.sh b/ansible/roles/ooni-backend/templates/dehydrated_haproxy_hook.sh new file mode 100644 index 00000000..0e5b41f3 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/dehydrated_haproxy_hook.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Deployed by ansible +# See roles/ooni-backend/templates/dehydrated_haproxy_hook.sh +# +# Deploys chained privkey and certificates for haproxy +# Reloads haproxy as needed + +deploy_cert() { + local DOMAIN="${1}" KEYFILE="${2}" CERTFILE="${3}" FULLCHAINFILE="${4}" CHAINFILE="${5}" TIMESTAMP="${6}" + # Called once for each certificate + # /var/lib/dehydrated/certs/backend-hel.ooni.org/privkey.pem /var/lib/dehydrated/certs/backend-hel.ooni.org/cert.pem /var/lib/dehydrated/certs/backend-hel.ooni.org/fullchain.pem > /var/lib/dehydrated/certs/backend-hel.ooni.org/haproxy.pem + # cp "${KEYFILE}" "${FULLCHAINFILE}" /etc/nginx/ssl/; chown -R nginx: /etc/nginx/ssl + logger "deploy_cert hook reading ${KEYFILE} ${CERTFILE} ${FULLCHAINFILE}" + cat "${KEYFILE}" "${CERTFILE}" "${FULLCHAINFILE}" > "${KEYFILE}.haproxy" + logger "deploy_cert reloading haproxy" + systemctl reload haproxy.service +} + +HANDLER="$1"; shift +if [[ "${HANDLER}" =~ ^(deploy_cert)$ ]]; then + "$HANDLER" "$@" +fi diff --git a/ansible/roles/ooni-backend/templates/fastpath.conf b/ansible/roles/ooni-backend/templates/fastpath.conf new file mode 100644 index 00000000..031f49a0 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/fastpath.conf @@ -0,0 +1,15 @@ +# See roles/ooni-backend/tasks/main.yml +[DEFAULT] +collectors = localhost +{% if psql_uri is defined %} +# The password is already made public +db_uri = {{ psql_uri }} +{% else %} +db_uri = +{% endif %} +clickhouse_url = {{ clickhouse_url }} + +# S3 access credentials +# Currently unused +s3_access_key = +s3_secret_key = diff --git a/ansible/roles/ooni-backend/templates/haproxy.cfg b/ansible/roles/ooni-backend/templates/haproxy.cfg new file mode 100644 index 00000000..025a4fc2 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/haproxy.cfg @@ -0,0 +1,122 @@ +## Deployed by ansible, see roles/ooni-backend/templates/haproxy.cfg + +# Proxies to: +# - local nginx +# - remote test helpers +# See http://interactive.blockdiag.com/?compression=deflate&src=eJyFjjELwjAQhXd_xeFuEdpBEAURBwfBXSSk6ZkEr7mSZGgR_7tNXdoiuD2--7j3SmL1rKzU8FoAFEUOqz0Y2XhuuxSHICKLiCEKg9Sg3_bmSHHaujaxISRyuJ7hRrJEgh0slVTGOr28Txz2yvQvvYw44R617XGXMTubWU7HzXq26kfl8XISykgidBphVP-whLPuOtRRhIaZ_ogVlt8d7PVYDXkS3x_pgmPP + +global + log /dev/log local0 info alert + log /dev/log local1 notice alert + chroot /var/lib/haproxy + stats socket /run/haproxy/admin.sock mode 660 level admin + stats timeout 30s + user haproxy + group haproxy + daemon + + # Default SSL material locations + ca-base /etc/ssl/certs + crt-base /etc/ssl/private + + # See: https://ssl-config.mozilla.org/#server=haproxy&server-version=2.0.3&config=intermediate + ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384 + ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256 + ssl-default-bind-options ssl-min-ver TLSv1.2 no-tls-tickets + +defaults + log global + mode http + option httplog + option dontlognull + timeout connect 5000 + timeout client 50000 + timeout server 
50000
+    errorfile 400 /etc/haproxy/errors/400.http
+    errorfile 403 /etc/haproxy/errors/403.http
+    errorfile 408 /etc/haproxy/errors/408.http
+    errorfile 500 /etc/haproxy/errors/500.http
+    errorfile 502 /etc/haproxy/errors/502.http
+    errorfile 503 /etc/haproxy/errors/503.http
+    errorfile 504 /etc/haproxy/errors/504.http
+
+    log-format "%[var(txn.src_ipaddr_masked)] %ft > %b > %s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r"
+
+frontend haproxy_metrics
+    # Metrics exposed on TLS port 444
+    # File generated by /etc/dehydrated/haproxy_hook.sh
+    bind :444 ssl crt /var/lib/dehydrated/certs/"{{ inventory_hostname }}"/privkey.pem.haproxy
+
+    http-request set-var(txn.src_ipaddr_masked) src,ipmask(24,64)
+
+    # /__haproxy_stats stats page
+    stats enable
+    stats uri /__haproxy_stats
+    stats refresh 5s
+
+    # /__haproxy_prom_metrics prometheus metrics
+    http-request use-service prometheus-exporter if { path /__haproxy_prom_metrics }
+
+
+frontend public_tls
+    # TLS on port 443
+    # File generated by /etc/dehydrated/haproxy_hook.sh
+    bind :443 ssl crt /var/lib/dehydrated/certs/{{ inventory_hostname }}/privkey.pem.haproxy
+
+    http-request set-var(txn.src_ipaddr_masked) src,ipmask(24,64)
+
+    # test helpers
+    default_backend lb_test_helpers
+
+    # deb.ooni.org
+    acl ACL_deb_ooni_org hdr(host) -i deb.ooni.org
+    use_backend deb_ooni_org if ACL_deb_ooni_org
+
+    # Nginx
+    use_backend nginx if !{ path / } || !{ method POST }
+
+
+frontend public_80
+    # Forwarded to Nginx for ACME and deb.ooni.org
+    bind :80
+
+    http-request set-var(txn.src_ipaddr_masked) src,ipmask(24,64)
+
+    # ACME
+    use_backend nginx if { path_beg /.well-known/acme-challenge }
+
+    # deb.ooni.org
+    acl ACL_deb_ooni_org hdr(host) -i deb.ooni.org
+    use_backend deb_ooni_org if ACL_deb_ooni_org
+
+
+
+backend nginx
+    # Local Nginx is in front of the API and more. See diagram.
+    default-server check
+    option forwardfor
+    #option httpchk GET /
+    # forward to local nginx
+    server nginx localhost:17744
+
+
+backend lb_test_helpers
+    # Remote test helpers
+    default-server check
+    option forwardfor
+    http-check send meth POST uri / hdr Content-Type application/json body "{}"
+    http-check send-state
+    http-check comment "TH POST with empty JSON"
+
+    server th0 0.th.ooni.org:443 ssl verify none
+    server th1 1.th.ooni.org:443 ssl verify none
+    server th2 2.th.ooni.org:443 ssl verify none
+    server th3 3.th.ooni.org:443 ssl verify none
+    #option httpchk
+
+
+backend deb_ooni_org
+    #default-server check
+    option forwardfor
+    server s3-ooni-deb ooni-deb.s3.eu-central-1.amazonaws.com ssl verify none
+
diff --git a/ansible/roles/ooni-backend/templates/nginx-api-ams-pg.conf b/ansible/roles/ooni-backend/templates/nginx-api-ams-pg.conf
new file mode 100644
index 00000000..4e3cf934
--- /dev/null
+++ b/ansible/roles/ooni-backend/templates/nginx-api-ams-pg.conf
@@ -0,0 +1,297 @@
+# Managed by ansible
+# roles/ooni-backend/templates/nginx-api-ams-pg.conf
+
+# Use 2-level cache, 100MB of RAM for keys + 5GB on disk,
+proxy_cache_path /var/cache/nginx/ooni-api levels=1:2 keys_zone=apicache:100M
+                 max_size=5g inactive=24h use_temp_path=off;
+
+# anonymize ipaddr
+map $remote_addr $remote_addr_anon {
+    ~(?P<ip>\d+\.\d+\.\d+)\.    $ip.0;
+    ~(?P<ip>[^:]+:[^:]+):       $ip::;
+    default                     0.0.0.0;
+}
+
+# log anonymized ipaddr and caching status
+log_format ooni_api_fmt '$remote_addr_anon $upstream_cache_status [$time_local] '
+    '"$request" $status snt:$body_bytes_sent rt:$request_time uprt:$upstream_response_time "$http_referer" "$http_user_agent"';
+
+server {
+    # TODO(bassosimone): we need support for cleartext HTTP to make sure that requests
+    # over Tor correctly land on the proper backend. We are listening on this custom port
+    # and we are configuring Tor such that it routes traffic to this port.
+    listen 127.0.0.1:17744;
+
+    listen 443 ssl http2;
+    listen [::]:443 ssl http2;
+    server_name _;
+    access_log syslog:server=unix:/dev/log,tag=ooniapi,severity=info ooni_api_fmt;
+    error_log syslog:server=unix:/dev/log,tag=ooniapi,severity=info;
+    gzip on;
+
+    # TODO: we could use different client_max_body_size and SSL configurations for probe service paths
+    # and everything else
+    client_max_body_size 200M; # for measurement POST
+
+    ssl_certificate {{ certpath }}{{ inventory_hostname }}/fullchain.pem;
+    ssl_certificate_key {{ certpath }}{{ inventory_hostname }}/privkey.pem;
+    ssl_trusted_certificate {{ certpath }}{{ inventory_hostname }}/chain.pem; # for ssl_stapling_verify
+
+    # Use the intermediate configuration to support legacy probes
+    # https://ssl-config.mozilla.org/#server=nginx&version=1.14.2&config=intermediate&openssl=1.1.1d&guideline=5.6
+    ssl_session_timeout 5m;
+    ssl_session_cache shared:MozSSL:30m;
+    ssl_session_tickets off;
+
+    ssl_protocols TLSv1.2 TLSv1.3;
+    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384;
+    ssl_prefer_server_ciphers off;
+
+    # HSTS (ngx_http_headers_module is required) (63072000 seconds)
+    add_header Strict-Transport-Security "max-age=63072000" always;
+
+    # OCSP stapling
+    ssl_stapling on;
+    ssl_stapling_verify on;
+
+    # verify chain of trust of OCSP response using Root CA and Intermediate certs
+    #ssl_trusted_certificate /path/to/root_CA_cert_plus_intermediates;
+
+    resolver 127.0.0.1;
+
+    # Registry
+    # Should match:
+    # - /api/v1/login
+    # - /api/v1/register
+    # - /api/v1/update
+    location ~^/api/v1/(login|register|update) {
+        proxy_http_version 1.1;
+        proxy_set_header Host $http_host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_read_timeout 900;
+
+        proxy_pass https://registry.ooni.io:443;
+    }
+
+    # Selectively route test-list/urls to the API
+    location ~^/api/v1/test-list/urls {
+        proxy_pass http://127.0.0.1:8000;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_cache apicache;
+        proxy_cache_min_uses 1;
+        proxy_cache_lock on;
+        proxy_cache_lock_timeout 30;
+        proxy_cache_lock_age 30;
+        proxy_cache_use_stale error timeout invalid_header updating;
+        proxy_cache_methods HEAD GET;
+        # Cache only 200, 301, and 302 by default and for very short.
+ # Overridden by the API using the Expires header + proxy_cache_valid 200 301 302 10s; + proxy_cache_valid any 0; + add_header x-cache-status $upstream_cache_status; + add_header X-Cache-Status $upstream_cache_status; + } + + # Orchestrate + # Should match: + # - /api/v1/test-list + location ~^/api/v1/(test-list|urls) { + proxy_http_version 1.1; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 900; + + proxy_pass https://orchestrate.ooni.io:443; + } + + # Web Connectivity Test Helper + # Should match: + # - / + # - /status + # + # The fact that it responds to / means that we may have to differentiate + # via the Host record. + # TODO We should check if clients will respect a suffix added to by the + # bouncer in the returned field, otherwise new clients should use another + # form + location ~^/web-connectivity/(status) { + proxy_http_version 1.1; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 900; + + proxy_pass https://wcth.ooni.io; + } + + location /whoami { + return 200 "{{ inventory_hostname }}"; + } + + location /metrics { + return 200 ''; + } + + # Expose (only) Netdata badges + location ~ ^/netdata/badge { + rewrite ^/netdata/badge /api/v1/badge.svg break; + proxy_pass http://127.0.0.1:19999; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + + # Expose package version badges + location /package_badges { + root /var/www; + add_header Pragma "no-cache"; + add_header Cache-Control "no-store, no-cache, must-revalidate, post-check=0, pre-check=0"; + } + + # Temporary redirection to backend-FSN + location ~ ^/api/v1/(aggregation|measurements|raw_measurement|measurement_meta) { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location ~ ^/api/_/(asn_by_month|countries|countries_by_month|check_report_id|country_overview|global_overview|global_overview_by_month|im_networks|im_stats|network_stats) { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location ~ ^/api/_/(test_coverage|website_networks|website_stats|website_urls|vanilla_tor_stats|test_names) { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location = /api/_/circumvention_stats_by_country { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location = / { + # match "/" strictly, not as a prefix + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location ~ ^/static/ { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + # open and close reports, submit msmt + location ~ ^/report/ { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + + # Auth, URL sumbission, URL priorities + location ~ ^/api/v1/(url-submission|get_account_role|set_account_role|set_session_expunge|user_login|user_register|user_logout) { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + 
proxy_set_header X-Real-IP $remote_addr; + } + location ~ ^/api/_/(url-priorities|account_metadata) { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + + location ~ ^/api/v1/(collectors|test-helpers|torsf_stats) { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location ~ ^/(robots.txt|files) { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location = /api/v1/test-list/tor-targets { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location = /api/v1/test-list/urls { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location = /bouncer/net-tests { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + location = /api/v1/test-list/psiphon-config { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + #location ~ ^/api/_/(test_names) { + # proxy_pass https://backend-fsn.ooni.org; + # proxy_set_header Host $host; + # proxy_set_header X-Real-IP $remote_addr; + #} + ## /files* tree + #location ~ ^/files { + # proxy_pass https://backend-fsn.ooni.org; + # proxy_set_header Host $host; + # proxy_set_header X-Real-IP $remote_addr; + #} + #location ~ ^/(health) { + # proxy_pass https://backend-fsn.ooni.org; + # proxy_set_header Host $host; + # proxy_set_header X-Real-IP $remote_addr; + #} + + # Temporary redirect + location = /api/v1/check-in { + proxy_pass https://backend-fsn.ooni.org; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + + # new API + location / { + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_cache apicache; + proxy_cache_min_uses 1; + proxy_cache_lock on; + proxy_cache_lock_timeout 30; + proxy_cache_lock_age 30; + proxy_cache_use_stale error timeout invalid_header updating; + proxy_cache_methods HEAD GET; + # Cache only 200, 301, and 302 by default and for very short. + # Overridden by the API using the Expires header + proxy_cache_valid 200 301 302 10s; + proxy_cache_valid any 0; + add_header x-cache-status $upstream_cache_status; + add_header X-Cache-Status $upstream_cache_status; + } + + # Expose the measurement spool directory + location /measurement_spool/ { + alias /var/lib/ooniapi/measurements/incoming/; + autoindex off; + sendfile on; + tcp_nopush on; + if_modified_since off; + expires off; + etag off; + + gzip_comp_level 6; + gzip_min_length 1240; + gzip_proxied any; + gzip_types *; + gzip_vary on; + } +} diff --git a/ansible/roles/ooni-backend/templates/nginx-api-fsn.conf b/ansible/roles/ooni-backend/templates/nginx-api-fsn.conf new file mode 100644 index 00000000..9d6e1451 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/nginx-api-fsn.conf @@ -0,0 +1,260 @@ +# Managed by ansible +# roles/ooni-backend/templates/nginx-api-fsn.conf + +# Use 2-level cache, 20MB of RAM + 5GB on disk, +proxy_cache_path /var/cache/nginx/ooni-api levels=1:2 keys_zone=apicache:100M + max_size=5g inactive=24h use_temp_path=off; + +# anonymize ipaddr +map $remote_addr $remote_addr_anon { + ~(?P\d+\.\d+\.\d+)\. 
$ip.0; + ~(?P[^:]+:[^:]+): $ip::; + default 0.0.0.0; +} + +# anonymize forwarded ipaddr +map $http_x_forwarded_for $remote_fwd_anon { + ~(?P\d+\.\d+\.\d+)\. $ip.0; + ~(?P[^:]+:[^:]+): $ip::; + default 0.0.0.0; +} + + +# log anonymized ipaddr and caching status +log_format ooni_api_fmt '$remote_addr_anon $remote_fwd_anon $upstream_cache_status [$time_local] ' + '"$request" $status snt:$body_bytes_sent rt:$request_time uprt:$upstream_response_time "$http_referer" "$http_user_agent"'; + +server { + # TODO(bassosimone): we need support for cleartext HTTP to make sure that requests + # over Tor correctly land to the proper backend. We are listening on this custom port + # and we are configuring Tor such that it routes traffic to this port. + listen 127.0.0.1:17744; + + listen 443 ssl http2 default_server; + listen [::]:443 ssl http2 default_server; + server_name _; + access_log syslog:server=unix:/dev/log,tag=ooniapi,severity=info ooni_api_fmt; + error_log syslog:server=unix:/dev/log,tag=ooniapi,severity=info; + gzip on; + gzip_types text/plain application/xml application/json; + + # TODO: we could use different client_max_body_size and SSL configurations for probe service paths + # and everyhing else + client_max_body_size 200M; # for measurement POST + + ssl_certificate {{ certpath }}{{ inventory_hostname }}/fullchain.pem; + ssl_certificate_key {{ certpath }}{{ inventory_hostname }}/privkey.pem; + ssl_trusted_certificate {{ certpath }}{{ inventory_hostname }}/chain.pem; # for ssl_stapling_verify + + # Use the intermediate configuration to support legacy probes + # https://ssl-config.mozilla.org/#server=nginx&version=1.14.2&config=intermediate&openssl=1.1.1d&guideline=5.6 + ssl_session_timeout 5m; + ssl_session_cache shared:MozSSL:30m; + ssl_session_tickets off; + + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + + # OCSP stapling + ssl_stapling on; + ssl_stapling_verify on; + + # verify chain of trust of OCSP response using Root CA and Intermediate certs + #ssl_trusted_certificate /path/to/root_CA_cert_plus_intermediates; + + resolver 127.0.0.1; + + # Registry + # Should match: + # - /api/v1/login + # - /api/v1/register + # - /api/v1/update + location ~^/api/v1/(login|register|update) { + proxy_http_version 1.1; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 900; + + proxy_pass https://registry.ooni.io:443; + } + + # Selectively route test-list/urls to the API + location ~^/api/v1/test-list/urls { + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_cache apicache; + proxy_cache_min_uses 1; + proxy_cache_lock on; + proxy_cache_lock_timeout 30; + proxy_cache_lock_age 30; + proxy_cache_use_stale error timeout invalid_header updating; + proxy_cache_methods HEAD GET; + # Cache only 200, 301, and 302 by default and for very short. 
+ # Overridden by the API using the Expires header + proxy_cache_valid 200 301 302 10s; + proxy_cache_valid any 0; + add_header x-cache-status $upstream_cache_status; + add_header X-Cache-Status $upstream_cache_status; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + } + + # Orchestrate + # Should match: + # - /api/v1/test-list + location ~^/api/v1/(test-list|urls) { + proxy_http_version 1.1; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 900; + + proxy_pass https://orchestrate.ooni.io:443; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + + } + + # Web Connectivity Test Helper + # Should match: + # - / + # - /status + # + # The fact that it responds to / means that we may have to differentiate + # via the Host record. + # TODO We should check if clients will respect a suffix added to by the + # bouncer in the returned field, otherwise new clients should use another + # form + location ~^/web-connectivity/(status) { + proxy_http_version 1.1; + proxy_set_header Host $http_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 900; + + proxy_pass https://wcth.ooni.io; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + } + + location /whoami { + return 200 "{{ inventory_hostname }}"; + } + + location /metrics { + return 200 ''; + } + + # Expose event detector RSS/atom feeds + location ~ ^/detector { + root /var/lib; + default_type application/xml; + } + + # Expose (only) Netdata badges + location ~ ^/netdata/badge { + rewrite ^/netdata/badge /api/v1/badge.svg break; + proxy_pass http://127.0.0.1:19999; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + + # Expose package version badges + location /package_badges { + root /var/www; + add_header Pragma "no-cache"; + add_header Cache-Control "no-store, no-cache, must-revalidate, post-check=0, pre-check=0"; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + } + + # 2022-09-01 20:08 CEST temporarily block a bot scraping /files/download/* + location ~^/files/download/ { + return 301 https://explorer.ooni.org/; + } + + # new API + location / { + + # Protect /apidocs invoked with url= and/or urls= args + if ($uri ~ "^/apidocs") { set $block_apidocs X; } + if ($args ~ "url=" ) { set $block_apidocs "${block_apidocs}Y"; } + if ($args ~ "urls=" ) { set $block_apidocs "${block_apidocs}Y"; } + if ($block_apidocs ~ "XY") { return 403; } # nested "if" are not supported + + deny 216.244.66.0/24; # DotBot/1.2 + deny 114.119.128.0/19; # PetalBot + allow all; + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + + # match test-helper POST to / and forward traffic to a TH + if ($request_uri = "/") { set $forward_to_th "YE"; } + if ($request_method = POST) { set $forward_to_th "${forward_to_th}S"; } + if ($forward_to_th = 
"YES") { + proxy_pass https://0.th.ooni.org; + } + + set $external_remote_addr $remote_addr; + if ($remote_addr = "188.166.93.143") { + # If remote_addr is ams-pg-test trust the X-Real-IP header + set $external_remote_addr $http_x_real_ip; + } + if ($remote_addr = "142.93.237.101") { + # If remote_addr is ams-pg trust the X-Real-IP header + set $external_remote_addr $http_x_real_ip; + } + proxy_set_header X-Real-IP $external_remote_addr; + + proxy_cache apicache; + proxy_cache_min_uses 1; + proxy_cache_lock on; + proxy_cache_lock_timeout 30; + proxy_cache_lock_age 30; + proxy_cache_use_stale error timeout invalid_header updating; + proxy_cache_methods HEAD GET; + # Cache only 200, 301, and 302 by default and for very short. + # Overridden by the API using the Expires header + proxy_cache_valid 200 301 302 10s; + proxy_cache_valid any 0; + add_header x-cache-status $upstream_cache_status; + add_header X-Cache-Status $upstream_cache_status; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + } + + # Expose the measurement spool directory + location /measurement_spool/ { + alias /var/lib/ooniapi/measurements/incoming/; + autoindex off; + sendfile on; + tcp_nopush on; + if_modified_since off; + expires off; + etag off; + } +} + +# Used by Netdata to monitor Nginx +server { + listen 127.0.0.1:80; + server_name localhost; + location = /stub_status { + stub_status; + } +} diff --git a/ansible/roles/ooni-backend/templates/nginx-api-test.conf b/ansible/roles/ooni-backend/templates/nginx-api-test.conf new file mode 100644 index 00000000..092d40db --- /dev/null +++ b/ansible/roles/ooni-backend/templates/nginx-api-test.conf @@ -0,0 +1,157 @@ +# Managed by ansible +# roles/ooni-backend/templates/nginx-api-test.conf + +# Use 2-level cache, 20MB of RAM + 5GB on disk, +proxy_cache_path /var/cache/nginx/ooni-api levels=1:2 keys_zone=apicache:100M + max_size=5g inactive=24h use_temp_path=off; + +# anonymize ipaddr +map $remote_addr $remote_addr_anon { + ~(?P\d+\.\d+\.\d+)\. $ip.0; + ~(?P[^:]+:[^:]+): $ip::; + default 0.0.0.0; +} + +# anonymize forwarded ipaddr +map $http_x_forwarded_for $remote_fwd_anon { + ~(?P\d+\.\d+\.\d+)\. $ip.0; + ~(?P[^:]+:[^:]+): $ip::; + default 0.0.0.0; +} + + +# log anonymized ipaddr and caching status +log_format ooni_api_fmt '$remote_addr_anon $remote_fwd_anon $upstream_cache_status [$time_local] ' + '"$request" $status snt:$body_bytes_sent rt:$request_time uprt:$upstream_response_time "$http_referer" "$http_user_agent"'; + +server { + # TODO(bassosimone): we need support for cleartext HTTP to make sure that requests + # over Tor correctly land to the proper backend. We are listening on this custom port + # and we are configuring Tor such that it routes traffic to this port. 
+ listen 127.0.0.1:17744; + server_name _; + access_log syslog:server=unix:/dev/log,tag=ooniapi,severity=info ooni_api_fmt; + error_log syslog:server=unix:/dev/log,tag=ooniapi,severity=info; + gzip on; + gzip_types text/plain application/xml application/json; + + # TODO: we could use different client_max_body_size and SSL configurations for probe service paths + # and everyhing else + client_max_body_size 200M; # for measurement POST + + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + + # use systemd-resolved + resolver 127.0.0.53; + + # Selectively route test-list/urls to the API + location ~^/api/v1/test-list/urls { + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_cache apicache; + proxy_cache_min_uses 1; + proxy_cache_lock on; + proxy_cache_lock_timeout 30; + proxy_cache_lock_age 30; + proxy_cache_use_stale error timeout invalid_header updating; + proxy_cache_methods HEAD GET; + # Cache only 200, 301, and 302 by default and for very short. + # Overridden by the API using the Expires header + proxy_cache_valid 200 301 302 10s; + proxy_cache_valid any 0; + add_header x-cache-status $upstream_cache_status; + add_header X-Cache-Status $upstream_cache_status; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + } + + location /whoami { + return 200 "{{ inventory_hostname }}"; + } + + # Serve ACME challenge from disk + location ^~ /.well-known/acme-challenge { + alias /var/lib/dehydrated/acme-challenges; + } + + # 2022-09-01 20:08 CEST temporarily block a bot scraping /files/download/* + location ~^/files/download/ { + return 301 https://explorer.ooni.org/; + } + + # new API + location / { + + # Protect /apidocs invoked with url= and/or urls= args + if ($uri ~ "^/apidocs") { set $block_apidocs X; } + if ($args ~ "url=" ) { set $block_apidocs "${block_apidocs}Y"; } + if ($args ~ "urls=" ) { set $block_apidocs "${block_apidocs}Y"; } + if ($block_apidocs ~ "XY") { return 403; } # nested "if" are not supported + + deny 216.244.66.0/24; # DotBot/1.2 + deny 114.119.128.0/19; # PetalBot + allow all; + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + + set $external_remote_addr $remote_addr; + if ($remote_addr = "188.166.93.143") { + # If remote_addr is ams-pg-test trust the X-Real-IP header + set $external_remote_addr $http_x_real_ip; + } + if ($remote_addr = "142.93.237.101") { + # If remote_addr is ams-pg trust the X-Real-IP header + set $external_remote_addr $http_x_real_ip; + } + proxy_set_header X-Real-IP $external_remote_addr; + + proxy_cache apicache; + proxy_cache_min_uses 1; + proxy_cache_lock on; + proxy_cache_lock_timeout 30; + proxy_cache_lock_age 30; + proxy_cache_use_stale error timeout invalid_header updating; + proxy_cache_methods HEAD GET; + # Cache only 200, 301, and 302 by default and for very short. 
+ # Overridden by the API using the Expires header + proxy_cache_valid 200 301 302 10s; + proxy_cache_valid any 0; + add_header x-cache-status $upstream_cache_status; + add_header X-Cache-Status $upstream_cache_status; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header X-Frame-Options DENY always; + add_header X-Content-Type-Options nosniff always; + } + + # Expose the measurement spool directory + location /measurement_spool/ { + alias /var/lib/ooniapi/measurements/incoming/; + autoindex off; + sendfile on; + tcp_nopush on; + if_modified_since off; + expires off; + etag off; + } +} + +server { + # Forward deb.ooni.org to S3 + listen 17744; + server_name deb.ooni.org; + access_log syslog:server=unix:/dev/log,severity=info ooni_api_fmt; + error_log syslog:server=unix:/dev/log,severity=info; + gzip on; + resolver 127.0.0.53; + # Serve ACME challenge from disk + location ^~ /.well-known/acme-challenge { + alias /var/lib/dehydrated/acme-challenges; + } + location / { + proxy_pass https://ooni-deb.s3.eu-central-1.amazonaws.com/; + } +} diff --git a/ansible/roles/ooni-backend/templates/rotation_nginx_conf b/ansible/roles/ooni-backend/templates/rotation_nginx_conf new file mode 100644 index 00000000..63255e51 --- /dev/null +++ b/ansible/roles/ooni-backend/templates/rotation_nginx_conf @@ -0,0 +1,70 @@ +# Managed by ansible, see roles/ooni-backend/tasks/main.yml +# and roles/ooni-backend/templates/rotation_nginx_conf +# Deployed by rotation tool to the test-helper hosts +proxy_cache_path /var/cache/nginx levels=1:2 keys_zone=thcache:100M + max_size=5g inactive=24h use_temp_path=off; + +server { + listen 443 ssl http2; + listen [::]:443 ssl http2; + server_name _; + gzip on; + ssl_certificate /etc/ssl/private/th_fullchain.pem; + ssl_certificate_key /etc/ssl/private/th_privkey.pem; + ssl_session_timeout 5m; + ssl_session_cache shared:MozSSL:30m; + ssl_session_tickets off; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384; + ssl_prefer_server_ciphers off; + add_header Strict-Transport-Security "max-age=63072000" always; + ssl_stapling on; + ssl_stapling_verify on; + resolver 127.0.0.1; + # local test helper + location / { + proxy_set_header X-Forwarded-Proto $scheme; + proxy_read_timeout 900; + proxy_pass http://127.0.0.1:8080; + + proxy_cache thcache; + proxy_cache_min_uses 1; + proxy_cache_lock on; + proxy_cache_lock_timeout 30; + proxy_cache_lock_age 30; + proxy_cache_use_stale error timeout invalid_header updating; + # Cache POST without headers set by the test helper! 
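+        # POST responses are not cacheable by default; the cache key below
+        # mixes in the request body so distinct measurement payloads get
+        # distinct cache entries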
+        proxy_cache_methods POST;
+        proxy_cache_key "$request_uri|$request_body";
+        proxy_cache_valid 200 10m;
+        proxy_cache_valid any 0;
+        add_header X-Cache-Status $upstream_cache_status;
+
+    }
+}
+
+# Used by Netdata to monitor Nginx
+server {
+    listen 127.0.0.1:80;
+    server_name localhost;
+
+    allow 5.9.112.244; # monitoring host
+    deny all;
+
+    location = /stub_status {
+        stub_status;
+    }
+}
+
+# Used by Prometheus to reach the TH
+server {
+    listen 9001;
+    server_name localhost;
+
+    allow 5.9.112.244; # monitoring host
+    deny all;
+
+    location = /metrics {
+        proxy_pass http://127.0.0.1:9091;
+    }
+}
diff --git a/ansible/roles/ooni-backend/templates/rotation_setup.sh b/ansible/roles/ooni-backend/templates/rotation_setup.sh
new file mode 100644
index 00000000..5706150c
--- /dev/null
+++ b/ansible/roles/ooni-backend/templates/rotation_setup.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# Managed by ansible, see roles/ooni-backend/tasks/main.yml
+#
+# Configure test-helper droplet
+# This script is run remotely on a newly spawned VM by https://github.com/ooni/backend/blob/master/analysis/rotation.py
+# It runs as root and with CWD=/
+#
+set -euo pipefail
+exec 1>/var/log/vm_rotation_setup.log 2>&1
+echo > /etc/motd
+
+echo "Configuring APT"
+echo "deb [trusted=yes] https://ooni-deb.s3.eu-central-1.amazonaws.com unstable main" > /etc/apt/sources.list.d/ooni.list
+cat <<EOF > /etc/apt/trusted.gpg.d/ooni.gpg
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mDMEYGISFRYJKwYBBAHaRw8BAQdA4VxoR0gSsH56BbVqYdK9HNQ0Dj2YFVbvKIIZ
+JKlaW920Mk9PTkkgcGFja2FnZSBzaWduaW5nIDxjb250YWN0QG9wZW5vYnNlcnZh
+dG9yeS5vcmc+iJYEExYIAD4WIQS1oI8BeW5/UhhhtEk3LR/ycfLdUAUCYGISFQIb
+AwUJJZgGAAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAKCRA3LR/ycfLdUFk+AQCb
+gsUQsAQGxUFvxk1XQ4RgEoh7wy2yTuK8ZCkSHJ0HWwD/f2OAjDigGq07uJPYw7Uo
+Ih9+mJ/ubwiPMzUWF6RSdgu4OARgYhIVEgorBgEEAZdVAQUBAQdAx4p1KerwcIhX
+HfM9LbN6Gi7z9j4/12JKYOvr0d0yC30DAQgHiH4EGBYIACYWIQS1oI8BeW5/Uhhh
+tEk3LR/ycfLdUAUCYGISFQIbDAUJJZgGAAAKCRA3LR/ycfLdUL4cAQCs53fLphhy
+6JMwVhRs02LXi1lntUtw1c+EMn6t7XNM6gD+PXpbgSZwoV3ZViLqr58o9fZQtV3s
+oN7jfdbznrWVigE=
+=PtYb
+-----END PGP PUBLIC KEY BLOCK-----
+EOF
+
+# Vector
+cat <<EOF > /etc/apt/trusted.gpg.d/vector.gpg
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v2
+
+mQENBF9gFZ0BCADETtIHM8y5ehMoyNiZcriK+tHXyKnbZCKtMCKcC4ll94/6pekQ
+jKIPWg8OXojkCtwua/TsddtQmOhUxAUtv6K0jO8r6sJ8rezMhuNH8J8rMqWgzv9d
+2+U7Z7GFgcP0OeD+KigtnR8uyp50suBmEDC8YytmmbESmG261Y38vZME0VvQ+CMy
+Yi/FvKXBXugaiCtaz0a5jVE86qSZbKbuaTHGiLn05xjTqc4FfyP4fi4oT2r6GGyL
+Bn5ob84OjXLQwfbZIIrNFR10BvL2SRLL0kKKVlMBBADodtkdwaTt0pGuyEJ+gVBz
+629PZBtSrwVRU399jGSfsxoiLca9//c7OJzHABEBAAG0OkNsb3Vkc21pdGggUGFj
+a2FnZSAodGltYmVyL3ZlY3RvcikgPHN1cHBvcnRAY2xvdWRzbWl0aC5pbz6JATcE
+EwEIACEFAl9gFZ0CGy8FCwkIBwMFFQoJCAsFFgIDAQACHgECF4AACgkQNUPbLQor
+xLhf6gf8DyfIpKjvEeW/O8lRUTpkiPKezJbb+udZboCXJKDD02Q9PE3hfEfQRr5X
+muytL7YMPvzqBVuP3xV5CN3zvtiQQbZiDhstImVyd+t24pQTkjzkvy+A2yvUuIkE
+RWxuey41f5FNj/7wdfJnHoU9uJ/lvsb7DLXw7FBMZFNBR6LED/d+b61zMzVvmFZA
+gsrCGwr/jfySwnpShmKdJaMTHQx0qt2RfXwNm2V6i900tAuMUWnmUIz5/9vENPKm
+0+31I43a/QgmIrKEePhwn2jfA1oRlYzdv+PbblSTfjTStem+GqQkj9bZsAuqVH8g
+3vq0NvX0k2CLi/W9mTiSdHXFChI15A==
+=k36w
+-----END PGP PUBLIC KEY BLOCK-----
+EOF
+
+echo "deb https://repositories.timber.io/public/vector/deb/debian bullseye main" > /etc/apt/sources.list.d/vector.list
+
+echo "Installing packages"
+export DEBIAN_FRONTEND=noninteractive
+apt-get update -q
+apt-get purge -qy unattended-upgrades rsyslog
+apt-get upgrade -qy
+apt-get install -qy --no-install-recommends chrony netdata oohelperd netdata-plugins-python
+
+systemctl daemon-reload
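+# rsyslog was purged above; restarting journald (assumed intent) keeps
+# syslog-style logging, including the logger marker below, working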
+systemctl restart systemd-journald.service +logger start +systemctl restart systemd-journald.service + +apt-get install -qy --no-install-recommends vector + +echo "Configuring Vector" +# The certs are copied over by rotation.py +cat > /etc/vector/vector.toml < /etc/netdata/netdata.conf < /var/run/rotation_setup_completed diff --git a/ansible/roles/ooni-backend/templates/tor_targets.json b/ansible/roles/ooni-backend/templates/tor_targets.json new file mode 100644 index 00000000..933c4ede --- /dev/null +++ b/ansible/roles/ooni-backend/templates/tor_targets.json @@ -0,0 +1,304 @@ +{ + "128.31.0.39:9101": { + "address": "128.31.0.39:9101", + "fingerprint": "9695DFC35FFEB861329B9F1AB04C46397020CE31", + "name": "moria1", + "protocol": "or_port_dirauth" + }, + "128.31.0.39:9131": { + "address": "128.31.0.39:9131", + "fingerprint": "9695DFC35FFEB861329B9F1AB04C46397020CE31", + "name": "moria1", + "protocol": "dir_port" + }, + "131.188.40.189:443": { + "address": "131.188.40.189:443", + "fingerprint": "F2044413DAC2E02E3D6BCF4735A19BCA1DE97281", + "name": "gabelmoo", + "protocol": "or_port_dirauth" + }, + "131.188.40.189:80": { + "address": "131.188.40.189:80", + "fingerprint": "F2044413DAC2E02E3D6BCF4735A19BCA1DE97281", + "name": "gabelmoo", + "protocol": "dir_port" + }, + "154.35.175.225:443": { + "address": "154.35.175.225:443", + "fingerprint": "CF6D0AAFB385BE71B8E111FC5CFF4B47923733BC", + "name": "Faravahar", + "protocol": "or_port_dirauth" + }, + "154.35.175.225:80": { + "address": "154.35.175.225:80", + "fingerprint": "CF6D0AAFB385BE71B8E111FC5CFF4B47923733BC", + "name": "Faravahar", + "protocol": "dir_port" + }, + "171.25.193.9:443": { + "address": "171.25.193.9:443", + "fingerprint": "BD6A829255CB08E66FBE7D3748363586E46B3810", + "name": "maatuska", + "protocol": "dir_port" + }, + "171.25.193.9:80": { + "address": "171.25.193.9:80", + "fingerprint": "BD6A829255CB08E66FBE7D3748363586E46B3810", + "name": "maatuska", + "protocol": "or_port_dirauth" + }, + "193.23.244.244:443": { + "address": "193.23.244.244:443", + "fingerprint": "7BE683E65D48141321C5ED92F075C55364AC7123", + "name": "dannenberg", + "protocol": "or_port_dirauth" + }, + "193.23.244.244:80": { + "address": "193.23.244.244:80", + "fingerprint": "7BE683E65D48141321C5ED92F075C55364AC7123", + "name": "dannenberg", + "protocol": "dir_port" + }, + "199.58.81.140:443": { + "address": "199.58.81.140:443", + "fingerprint": "74A910646BCEEFBCD2E874FC1DC997430F968145", + "name": "longclaw", + "protocol": "or_port_dirauth" + }, + "199.58.81.140:80": { + "address": "199.58.81.140:80", + "fingerprint": "74A910646BCEEFBCD2E874FC1DC997430F968145", + "name": "longclaw", + "protocol": "dir_port" + }, + "204.13.164.118:443": { + "address": "204.13.164.118:443", + "fingerprint": "24E2F139121D4394C54B5BCC368B3B411857C413", + "name": "bastet", + "protocol": "or_port_dirauth" + }, + "204.13.164.118:80": { + "address": "204.13.164.118:80", + "fingerprint": "24E2F139121D4394C54B5BCC368B3B411857C413", + "name": "bastet", + "protocol": "dir_port" + }, + "2d7292b5163fb7de5b24cd04032c93a2d4c454431de3a00b5a6d4a3309529e49": { + "address": "193.11.166.194:27020", + "fingerprint": "86AC7B8D430DAC4117E9F42C9EAED18133863AAF", + "params": { + "cert": [ + "0LDeJH4JzMDtkJJrFphJCiPqKx7loozKN7VNfuukMGfHO0Z8OGdzHVkhVAOfo1mUdv9cMg" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "3fa772a44e07856b4c70e958b2f6dc8a29450a823509d5dbbf8b884e7fb5bb9d": { + "address": "192.95.36.142:443", + "fingerprint": "CDF2E852BF539B82BD10E27E9115A31734E378C2", + "params": { 
+ "cert": [ + "qUVQ0srL1JI/vO6V6m/24anYXiJD3QP2HgzUKQtQ7GRqqUvs7P+tG43RtAqdhLOALP7DJQ" + ], + "iat-mode": [ + "1" + ] + }, + "protocol": "obfs4" + }, + "45.66.33.45:443": { + "address": "45.66.33.45:443", + "fingerprint": "7EA6EAD6FD83083C538F44038BBFA077587DD755", + "name": "dizum", + "protocol": "or_port_dirauth" + }, + "45.66.33.45:80": { + "address": "45.66.33.45:80", + "fingerprint": "7EA6EAD6FD83083C538F44038BBFA077587DD755", + "name": "dizum", + "protocol": "dir_port" + }, + "49116bf72d336bb8724fd3a06a5afa7bbd4e7baef35fbcdb9a98d13e702270ad": { + "address": "146.57.248.225:22", + "fingerprint": "10A6CD36A537FCE513A322361547444B393989F0", + "params": { + "cert": [ + "K1gDtDAIcUfeLqbstggjIw2rtgIKqdIhUlHp82XRqNSq/mtAjp1BIC9vHKJ2FAEpGssTPw" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "4a330634c5d678887f0f7c299490af43a6ac9fa944a6cc2140ab264c9ec124a0": { + "address": "209.148.46.65:443", + "fingerprint": "74FAD13168806246602538555B5521A0383A1875", + "params": { + "cert": [ + "ssH+9rP8dG2NLDN2XuFw63hIO/9MNNinLmxQDpVa+7kTOa9/m+tGWT1SmSYpQ9uTBGa6Hw" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "548eebff71da6128321c3bc1c3ec12b5bfff277ef5cde32709a33e207b57f3e2": { + "address": "37.218.245.14:38224", + "fingerprint": "D9A82D2F9C2F65A18407B1D2B764F130847F8B5D", + "params": { + "cert": [ + "bjRaMrr1BRiAW8IE9U5z27fQaYgOhX1UCmOpg2pFpoMvo6ZgQMzLsaTzzQNTlm7hNcb+Sg" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "5aeb9e43b43fc8a809b8d25aae968395a5ceea0e677caaf56e1c0a2ba002f5b5": { + "address": "193.11.166.194:27015", + "fingerprint": "2D82C2E354D531A68469ADF7F878FA6060C6BACA", + "params": { + "cert": [ + "4TLQPJrTSaDffMK7Nbao6LC7G9OW/NHkUwIdjLSS3KYf0Nv4/nQiiI8dY2TcsQx01NniOg" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "66.111.2.131:9001": { + "address": "66.111.2.131:9001", + "fingerprint": "BA44A889E64B93FAA2B114E02C2A279A8555C533", + "name": "Serge", + "protocol": "or_port_dirauth" + }, + "66.111.2.131:9030": { + "address": "66.111.2.131:9030", + "fingerprint": "BA44A889E64B93FAA2B114E02C2A279A8555C533", + "name": "Serge", + "protocol": "dir_port" + }, + "662218447d396b9d4f01b585457d267735601fedbeb9a19b86b942f238fe4e7b": { + "address": "51.222.13.177:80", + "fingerprint": "5EDAC3B810E12B01F6FD8050D2FD3E277B289A08", + "params": { + "cert": [ + "2uplIpLQ0q9+0qMFrK5pkaYRDOe460LL9WHBvatgkuRr/SL31wBOEupaMMJ6koRE6Ld0ew" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "75fe96d641a078fee06529af376d7f8c92757596e48558d5d02baa1e10321d10": { + "address": "45.145.95.6:27015", + "fingerprint": "C5B7CD6946FF10C5B3E89691A7D3F2C122D2117C", + "params": { + "cert": [ + "TD7PbUO0/0k6xYHMPW3vJxICfkMZNdkRrb63Zhl5j9dW3iRGiCx0A7mPhe5T2EDzQ35+Zw" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "86.59.21.38:443": { + "address": "86.59.21.38:443", + "fingerprint": "847B1F850344D7876491A54892F904934E4EB85D", + "name": "tor26", + "protocol": "or_port_dirauth" + }, + "86.59.21.38:80": { + "address": "86.59.21.38:80", + "fingerprint": "847B1F850344D7876491A54892F904934E4EB85D", + "name": "tor26", + "protocol": "dir_port" + }, + "99e9adc8bba0d60982dbc655b5e8735d88ad788905c3713a39eff3224b617eeb": { + "address": "38.229.1.78:80", + "fingerprint": "C8CBDB2464FC9804A69531437BCF2BE31FDD2EE4", + "params": { + "cert": [ + "Hmyfd2ev46gGY7NoVxA9ngrPF2zCZtzskRTzoWXbxNkzeVnGFPWmrTtILRyqCTjHR+s9dg" + ], + "iat-mode": [ + "1" + ] + }, + "protocol": "obfs4" + }, + "9d735c6e70512123ab2c2fe966446b2345b352c512e9fb359f4b1673236e4d4a": { 
+ "address": "38.229.33.83:80", + "fingerprint": "0BAC39417268B96B9F514E7F63FA6FBA1A788955", + "params": { + "cert": [ + "VwEFpk9F/UN9JED7XpG1XOjm/O8ZCXK80oPecgWnNDZDv5pdkhq1OpbAH0wNqOT6H6BmRQ" + ], + "iat-mode": [ + "1" + ] + }, + "protocol": "obfs4" + }, + "b7c0e3f183ad85a6686ec68344765cec57906b215e7b82a98a9ca013cb980efa": { + "address": "193.11.166.194:27025", + "fingerprint": "1AE2C08904527FEA90C4C4F8C1083EA59FBC6FAF", + "params": { + "cert": [ + "ItvYZzW5tn6v3G4UnQa6Qz04Npro6e81AP70YujmK/KXwDFPTs3aHXcHp4n8Vt6w/bv8cA" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "b8de51da541ced804840b1d8fd24d5ff1cfdf07eae673dae38c2bc2cce594ddd": { + "address": "85.31.186.26:443", + "fingerprint": "91A6354697E6B02A386312F68D82CF86824D3606", + "params": { + "cert": [ + "PBwr+S8JTVZo6MPdHnkTwXJPILWADLqfMGoVvhZClMq/Urndyd42BwX9YFJHZnBB3H0XCw" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "d2d6e34abeda851f7cd37138ffafcce992b2ccdb0f263eb90ab75d7adbd5eeba": { + "address": "85.31.186.98:443", + "fingerprint": "011F2599C0E9B27EE74B353155E244813763C3E5", + "params": { + "cert": [ + "ayq0XzCwhpdysn5o0EyDUbmSOx3X/oTEbzDMvczHOdBJKlvIdHHLJGkZARtT4dcBFArPPg" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + }, + "f855ba38d517d8589c16e1333ac23c6e516532cf036ab6f47b15030b40a3b6a6": { + "address": "[2a0c:4d80:42:702::1]:27015", + "fingerprint": "C5B7CD6946FF10C5B3E89691A7D3F2C122D2117C", + "params": { + "cert": [ + "TD7PbUO0/0k6xYHMPW3vJxICfkMZNdkRrb63Zhl5j9dW3iRGiCx0A7mPhe5T2EDzQ35+Zw" + ], + "iat-mode": [ + "0" + ] + }, + "protocol": "obfs4" + } +} \ No newline at end of file diff --git a/ansible/roles/oonidata/defaults/main.yml b/ansible/roles/oonidata/defaults/main.yml index c2b0d9d8..84694a5c 100644 --- a/ansible/roles/oonidata/defaults/main.yml +++ b/ansible/roles/oonidata/defaults/main.yml @@ -6,3 +6,25 @@ tls_cert_dir: /etc/letsencrypt/live admin_group_name: admin enable_oonipipeline_worker: true enable_jupyterhub: true +clickhouse_url: "clickhouse://localhost" +certbot_domains: + - "{{ inventory_hostname }}" +conda_forge_packages: + - seaborn + - dask + - statsmodels +conda_packages: + - pandas + - numpy + - altair + - bokeh +pip_packages: + - "-e 'git+https://github.com/ooni/data#egg=oonipipeline&subdirectory=oonipipeline'" + - "clickhouse-driver" + - pomegranate + - pgmpy +apt_packages: + - net-tools + - curl + - git + - socat diff --git a/ansible/roles/oonidata/handlers/main.yml b/ansible/roles/oonidata/handlers/main.yml index f12d0aa6..df50cce8 100644 --- a/ansible/roles/oonidata/handlers/main.yml +++ b/ansible/roles/oonidata/handlers/main.yml @@ -10,6 +10,12 @@ state: restarted daemon_reload: true +- name: Restart oonidata-proxy + ansible.builtin.systemd_service: + name: oonidata-proxy + state: restarted + daemon_reload: true + - name: Reload nginx ansible.builtin.systemd_service: name: nginx diff --git a/ansible/roles/oonidata/meta/requirements.yml b/ansible/roles/oonidata/meta/requirements.yml new file mode 100644 index 00000000..0c765e0c --- /dev/null +++ b/ansible/roles/oonidata/meta/requirements.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - geerlingguy.certbot diff --git a/ansible/roles/oonidata/tasks/jupyterhub.yml b/ansible/roles/oonidata/tasks/jupyterhub.yml index b6fa2f07..7502668b 100644 --- a/ansible/roles/oonidata/tasks/jupyterhub.yml +++ b/ansible/roles/oonidata/tasks/jupyterhub.yml @@ -2,13 +2,17 @@ - name: Install jupyterhub ansible.builtin.shell: cmd: "{{ miniconda_install_dir }}/bin/conda install -c conda-forge -y jupyterhub" + become_user: 
miniconda tags: + - oonidata - jupyterhub - name: Install jupyterlab and notebook ansible.builtin.shell: cmd: "{{ miniconda_install_dir }}/bin/conda install -y jupyterlab notebook" + become_user: miniconda tags: + - oonidata - jupyterhub - name: Install jupyterhub packages @@ -16,12 +20,14 @@ name: - npm tags: + - oonidata - jupyterhub - name: Install configurable-http-proxy ansible.builtin.shell: cmd: "npm install -g configurable-http-proxy" tags: + - oonidata - jupyterhub - name: Create jupyterhub directories @@ -33,6 +39,7 @@ - "{{ jupyterhub_runtime_dir }}" - "{{ jupyterhub_runtime_dir }}/state" tags: + - oonidata - jupyterhub - name: Write jupyterhub config @@ -44,6 +51,7 @@ notify: - Restart jupyterhub tags: + - oonidata - jupyterhub - config @@ -57,6 +65,7 @@ notify: - Restart jupyterhub tags: + - oonidata - jupyterhub - name: Ensure the JupyterHub service is started with daemon-reload @@ -66,6 +75,7 @@ enabled: true daemon_reload: true tags: + - oonidata - jupyterhub - config @@ -78,5 +88,26 @@ notify: - Reload nginx tags: + - oonidata - jupyterhub - - config \ No newline at end of file + - config + +- ansible.builtin.include_role: + name: nginx + tags: + - oonidata + - nginx + +- ansible.builtin.include_role: + name: geerlingguy.certbot + tags: + - oonidata + - certbot + vars: + certbot_admin_email: admin@ooni.org + certbot_create_extra_args: "" + certbot_create_if_missing: true + certbot_create_standalone_stop_services: + - nginx + certbot_certs: + - domains: "{{ certbot_domains }}" diff --git a/ansible/roles/oonidata/tasks/main.yml b/ansible/roles/oonidata/tasks/main.yml index fa19b49b..48f05bae 100644 --- a/ansible/roles/oonidata/tasks/main.yml +++ b/ansible/roles/oonidata/tasks/main.yml @@ -6,54 +6,64 @@ shell: /bin/false createhome: no tags: + - oonidata - oonipipeline - jupyterhub - ansible.builtin.include_role: name: miniconda tags: + - oonidata - conda - ansible.builtin.import_tasks: jupyterhub.yml when: enable_jupyterhub tags: + - oonidata - jupyterhub -- ansible.builtin.include_role: - name: nginx +- name: Install apt packages + ansible.builtin.apt: + name: "{{ apt_packages }}" tags: - - nginx - -- ansible.builtin.include_role: - name: geerlingguy.certbot - tags: - - certbot - vars: - certbot_admin_email: admin@ooni.org - certbot_create_extra_args: "" - certbot_create_if_missing: true - certbot_create_standalone_stop_services: - - nginx - certbot_certs: - - domains: - - "{{ inventory_hostname }}" + - oonidata + - oonipipeline + - packages -- name: Install oonipipeline requirements - ansible.builtin.apt: - name: - - net-tools - - curl - - git +- name: "install conda packages" + ansible.builtin.shell: + cmd: "{{ miniconda_install_dir }}/bin/conda install -y {{ item }}" + chdir: "{{ miniconda_install_dir }}" + loop: "{{ conda_packages }}" + become_user: miniconda tags: + - oonidata - oonipipeline + - packages + +- name: Install conda-forge packages + ansible.builtin.shell: + cmd: "{{ miniconda_install_dir }}/bin/conda install -c conda-forge -y {{ item }}" + chdir: "{{ miniconda_install_dir }}" + loop: "{{ conda_forge_packages }}" + become_user: miniconda + tags: + - oonidata + - packages -- name: Install OONI pipeline from pip +- name: "Install pip packages" ansible.builtin.shell: - cmd: "{{ miniconda_install_dir }}/bin/pip install -e 'git+https://github.com/ooni/data#egg=oonipipeline&subdirectory=oonipipeline'" + cmd: "{{ miniconda_install_dir }}/bin/pip install {{ item }}" + chdir: "{{ miniconda_install_dir }}" + loop: "{{ pip_packages }}" + become_user: miniconda tags: + 
- oonidata
     - oonipipeline
+    - packages
 
 - ansible.builtin.import_tasks: oonipipeline-worker.yml
   when: enable_oonipipeline_worker
   tags:
+    - oonidata
     - oonipipeline
diff --git a/ansible/roles/oonidata/templates/oonipipeline-config.toml.j2 b/ansible/roles/oonidata/templates/oonipipeline-config.toml.j2
index a41dcb43..d9461cbf 100644
--- a/ansible/roles/oonidata/templates/oonipipeline-config.toml.j2
+++ b/ansible/roles/oonidata/templates/oonipipeline-config.toml.j2
@@ -3,5 +3,6 @@ temporal_namespace = "ooni-pipeline.uuhzf"
 temporal_tls_client_cert_path = "/etc/ooni/pipeline/ooni-pipeline.uuhzf.crt"
 temporal_tls_client_key_path = "/etc/ooni/pipeline/ooni-pipeline.uuhzf.key"
 clickhouse_write_batch_size = 30000
+clickhouse_url = "{{ clickhouse_url }}"
 prometheus_bind_address = "127.0.0.1:9998"
-data_dir = "/srv/oonipipeline/data_dir"
\ No newline at end of file
+data_dir = "/srv/oonipipeline/data_dir"
diff --git a/ansible/roles/oonidata_airflow/Readme.md b/ansible/roles/oonidata_airflow/Readme.md
new file mode 100644
index 00000000..c43a8f20
--- /dev/null
+++ b/ansible/roles/oonidata_airflow/Readme.md
@@ -0,0 +1,25 @@
+## Airflow role deployment notes
+
+There are a few prerequisites for this role to run properly that
+you will have to set up manually:
+
+* Set up the PostgreSQL database and create the relevant DB and account.
+
+Be sure to give correct permissions to the airflow user. Here is a relevant snippet:
+```
+CREATE DATABASE airflow;
+CREATE ROLE airflow WITH PASSWORD '' LOGIN;
+GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;
+GRANT ALL ON SCHEMA public TO airflow;
+```
+
+* For some reason the admin account creation is failing. This is likely a bug
+  in the upstream role. During the last deploy this was addressed by logging
+into the host and running the create task manually:
+```
+AIRFLOW_CONFIG=/etc/airflow/airflow.cfg AIRFLOW_HOME=/opt/airflow/ /opt/airflow/bin/airflow users create --username admin --password XXX --firstname Open --lastname Observatory --role Admin --email admin@ooni.org
+```
+
+* Once the setup is complete, you will then have to log in as the
+  admin user, go into Admin->Configuration and add the `clickhouse_url`
+variable
diff --git a/ansible/roles/oonidata_airflow/defaults/main.yml b/ansible/roles/oonidata_airflow/defaults/main.yml
new file mode 100644
index 00000000..c422ed2b
--- /dev/null
+++ b/ansible/roles/oonidata_airflow/defaults/main.yml
@@ -0,0 +1,2 @@
+tls_cert_dir: /var/lib/dehydrated/certs
+certbot_domains_extra: []
diff --git a/ansible/roles/oonidata_airflow/handlers/main.yml b/ansible/roles/oonidata_airflow/handlers/main.yml
new file mode 100644
index 00000000..f6dda47d
--- /dev/null
+++ b/ansible/roles/oonidata_airflow/handlers/main.yml
@@ -0,0 +1,4 @@
+- name: Reload nginx
+  ansible.builtin.systemd_service:
+    name: nginx
+    state: reloaded
diff --git a/ansible/roles/oonidata_airflow/tasks/main.yml b/ansible/roles/oonidata_airflow/tasks/main.yml
new file mode 100644
index 00000000..625ed6b2
--- /dev/null
+++ b/ansible/roles/oonidata_airflow/tasks/main.yml
@@ -0,0 +1,89 @@
+- name: Ensure Airflow group
+  group:
+    name: "airflow"
+  become: true
+
+# TODO: uncomment this section if you want to redeploy it
+# this was added after the user had already been created by the airflow_role
+# and so it's failing because it's trying to modify the user. 
+#- name: Ensure Airflow user
+#  user:
+#    name: "airflow"
+#    group: "airflow"
+#    system: true
+#    shell: "/usr/sbin/nologin"
+#    createhome: "yes"
+#    home: "/opt/airflow"
+#  become: true
+
+- name: Checkout oonidata repo
+  become_user: airflow
+  ansible.builtin.git:
+    repo: 'https://github.com/ooni/data.git'
+    dest: /opt/airflow/oonidata
+    version: main
+
+- ansible.builtin.include_role:
+    name: ooni.airflow_role
+  tags:
+    - oonidata
+    - airflow
+  vars:
+    airflow_app_home: /opt/airflow
+    airflow_dags_folder: /opt/airflow/oonidata/dags/
+    airflow_webserver_host: "127.0.0.1"
+    airflow_webserver_port: 8080
+    airflow_webserver_base_url: "https://{{ airflow_public_fqdn }}"
+    airflow_environment_extra_vars:
+      - name: AIRFLOW_VAR_DATA_DIR
+        value: "{{ airflow_app_home }}/data_dir"
+    airflow_extra_packages:
+      - postgres
+      - virtualenv
+    airflow_services:
+      airflow_webserver:
+        service_name: airflow-webserver
+        enabled: true
+        running: true
+        state: started
+        path: airflow-webserver.service.j2
+      airflow_scheduler:
+        service_name: airflow-scheduler
+        enabled: true
+        running: true
+        state: started
+        path: airflow-scheduler.service.j2

- name: Set correct permissions on oonidata repo dir
+  ansible.builtin.file:
+    path: /opt/airflow/oonidata
+    state: directory
+    mode: '0755'
+    owner: airflow
+    recurse: yes
+
+- ansible.builtin.include_role:
+    name: nginx
+  tags:
+    - oonidata
+    - nginx
+
+- ansible.builtin.include_role:
+    name: dehydrated
+  tags:
+    - oonidata
+    - dehydrated
+  vars:
+    ssl_domains: "{{ [ inventory_hostname ] + [ airflow_public_fqdn ] }}"
+
+- name: Setup airflow nginx config
+  ansible.builtin.template:
+    src: nginx-airflow.j2
+    dest: /etc/nginx/sites-enabled/02-airflow
+    owner: root
+    mode: "0644"
+  notify:
+    - Reload nginx
+  tags:
+    - oonidata
+    - config
diff --git a/ansible/roles/oonidata_airflow/templates/nginx-airflow.j2 b/ansible/roles/oonidata_airflow/templates/nginx-airflow.j2
new file mode 100644
index 00000000..6c3b3fec
--- /dev/null
+++ b/ansible/roles/oonidata_airflow/templates/nginx-airflow.j2
@@ -0,0 +1,40 @@
+# ansible-managed in ooni/devops.git
+
+map $http_upgrade $connection_upgrade {
+    default upgrade;
+    ''      close;
+}
+
+server {
+    listen 443 ssl http2;
+
+    include /etc/nginx/ssl_intermediate.conf;
+
+    ssl_certificate {{ tls_cert_dir }}/{{ inventory_hostname }}/fullchain.pem;
+    ssl_certificate_key {{ tls_cert_dir }}/{{ inventory_hostname }}/privkey.pem;
+    ssl_trusted_certificate {{ tls_cert_dir }}/{{ inventory_hostname }}/chain.pem;
+
+    server_name {{ airflow_public_fqdn }};
+    access_log /var/log/nginx/{{ airflow_public_fqdn }}.access.log;
+    error_log /var/log/nginx/{{ airflow_public_fqdn }}.log warn;
+
+    add_header Access-Control-Allow-Origin *;
+
+    ## Airflow reverse proxy
+    location / {
+        proxy_pass http://127.0.0.1:8080;
+
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header Host $host;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+
+        client_max_body_size 100M;
+
+        # WebSocket support
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection $connection_upgrade;
+        proxy_set_header X-Scheme $scheme;
+        proxy_buffering off;
+    }
+}
diff --git a/ansible/roles/oonidata_clickhouse/defaults/main.yml b/ansible/roles/oonidata_clickhouse/defaults/main.yml
new file mode 100644
index 00000000..e69de29b
diff --git a/ansible/roles/oonidata_clickhouse/handlers/main.yml b/ansible/roles/oonidata_clickhouse/handlers/main.yml
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ 
b/ansible/roles/oonidata_clickhouse/handlers/main.yml @@ -0,0 +1 @@ + diff --git a/ansible/roles/oonidata_clickhouse/tasks/main.yml b/ansible/roles/oonidata_clickhouse/tasks/main.yml new file mode 100644 index 00000000..febc1884 --- /dev/null +++ b/ansible/roles/oonidata_clickhouse/tasks/main.yml @@ -0,0 +1,38 @@ +- ansible.builtin.include_role: + name: idealista.clickhouse_role + tags: + - oonidata + - clickhouse + +- name: Create nftables rule for clickhouse native port + set_fact: + ch_nft_rule: '{{ ch_nft_rule | default([]) + [''add rule inet filter input ip saddr '' + item.ip + '' tcp dport 9000 counter accept comment "incoming clickhouse from '' + item.fqdn + ''"''] }}' + loop: "{{ nftables_clickhouse_allow | rejectattr('fqdn', 'eq', inventory_hostname) | list }}" + +- name: Create nftables rule for clickhouse inter-server communication + set_fact: + is_nft_rule: '{{ is_nft_rule | default([]) + [''add rule inet filter input ip saddr '' + item.ip + '' tcp dport 9009 counter accept comment "incoming clickhouse from '' + item.fqdn + ''"''] }}' + loop: "{{ nftables_clickhouse_allow | rejectattr('fqdn', 'eq', inventory_hostname) | list }}" + +- name: Create nftables rule for zookeeper + set_fact: + zk_nft_rule: '{{ zk_nft_rule | default([]) + [''add rule inet filter input ip saddr '' + item.ip + '' tcp dport 9181 counter accept comment "incoming zookeeper from '' + item.fqdn + ''"''] }}' + loop: "{{ nftables_zookeeper_allow | rejectattr('fqdn', 'eq', inventory_hostname) | list }}" + +- name: Create nftables rule for raft port + set_fact: + raft_nft_rule: '{{ raft_nft_rule | default([]) + [''add rule inet filter input ip saddr '' + item.ip + '' tcp dport 9234 counter accept comment "incoming raft from '' + item.fqdn + ''"''] }}' + loop: "{{ nftables_zookeeper_allow | rejectattr('fqdn', 'eq', inventory_hostname) | list }}" + +- ansible.builtin.include_role: + name: nftables + vars: + nft_rules_tcp: + - name: 9000 + rules: "{{ ch_nft_rule }}" + - name: 9009 + rules: "{{ is_nft_rule }}" + - name: 9181 + rules: "{{ zk_nft_rule }}" + - name: 9234 + rules: "{{ raft_nft_rule }}" diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml index 4980743f..d933611e 100644 --- a/ansible/roles/prometheus/tasks/main.yml +++ b/ansible/roles/prometheus/tasks/main.yml @@ -11,7 +11,10 @@ - "alert_*.yml" notify: - reload prometheus - tags: prometheus + tags: + - monitoring + - prometheus + - config - name: Configure Prometheus template: @@ -23,7 +26,10 @@ validate: "/usr/bin/promtool check config %s" notify: - reload prometheus - tags: prometheus + tags: + - monitoring + - prometheus + - config vars: prometheus_metrics_password_dev: "{{ lookup('amazon.aws.aws_secret', 'oonidevops/ooni_services/prometheus_metrics_password', profile='oonidevops_user_dev') }}" prometheus_metrics_password_prod: "{{ lookup('amazon.aws.aws_secret', 'oonidevops/ooni_services/prometheus_metrics_password', profile='oonidevops_user_prod') }}" diff --git a/ansible/roles/prometheus/templates/prometheus.yml b/ansible/roles/prometheus/templates/prometheus.yml index e8f9cd30..bed0464e 100755 --- a/ansible/roles/prometheus/templates/prometheus.yml +++ b/ansible/roles/prometheus/templates/prometheus.yml @@ -151,6 +151,30 @@ scrape_configs: - targets: - backend-fsn.ooni.org:9363 + - job_name: 'clickhouse cluster' + scrape_interval: 5s + scheme: http + metrics_path: "/metrics/clickhouse" + basic_auth: + username: 'prom' + password: '{{ prometheus_metrics_password_prod }}' + static_configs: + - targets: + - 
data1.htz-fsn.prod.ooni.nu:9100 + - data3.htz-fsn.prod.ooni.nu:9100 + + - job_name: 'node new' + scrape_interval: 5s + scheme: http + metrics_path: "/metrics/node_exporter" + basic_auth: + username: 'prom' + password: '{{ prometheus_metrics_password_prod }}' + static_configs: + - targets: + - data1.htz-fsn.prod.ooni.nu:9100 + - data3.htz-fsn.prod.ooni.nu:9100 + # See ansible/roles/ooni-backend/tasks/main.yml for the scraping targets - job_name: 'haproxy' scrape_interval: 5s diff --git a/ansible/roles/prometheus/vars/main.yml b/ansible/roles/prometheus/vars/main.yml index 6550cd51..d8774c47 100644 --- a/ansible/roles/prometheus/vars/main.yml +++ b/ansible/roles/prometheus/vars/main.yml @@ -1,24 +1,20 @@ dom0_hosts: - ams-ps.ooni.nu - ams-slack-1.ooni.org - - ams-wcth2.ooni.nu - - ams-wcth3.ooni.nu - - amsmatomo.ooni.nu - - db-1.proteus.ooni.io - doams1-countly.ooni.nu - mia-echoth.ooni.nu - mia-httpth.ooni.nu blackbox_jobs: - - name: "ooni web_connectivity test helpers" - module: "ooni_web_connectivity_ok" - targets: - # - "https://a.web-connectivity.th.ooni.io/status" - - "https://wcth.ooni.io/status" - - "https://ams-wcth2.ooni.nu/status" - - "https://a.web-connectivity.th.ooni.io/status" # "https://ams-wcth3.ooni.nu/status" - # cloudfront - - "https://d33d1gs9kpq1c5.cloudfront.net/status" + # TODO add these records to the ALB config + #- name: "ooni web_connectivity test helpers" + # module: "ooni_web_connectivity_ok" + # targets: + # # - "https://a.web-connectivity.th.ooni.io/status" + # #- "https://wcth.ooni.io/status" + # #- "https://ams-wcth2.ooni.nu/status" + # #- "https://a.web-connectivity.th.ooni.io/status" # "https://ams-wcth3.ooni.nu/status" + # # cloudfront - name: "new test helpers" module: "new_test_helper_health" @@ -30,6 +26,7 @@ blackbox_jobs: - "https://4.th.ooni.org/" - "https://5.th.ooni.org/" - "https://6.th.ooni.org/" + - "https://d33d1gs9kpq1c5.cloudfront.net/status" - name: "ooni collector" module: "ooni_collector_ok" @@ -95,6 +92,12 @@ blackbox_jobs: - "https://api.ooni.io/api/_/global_overview" - "https://api.ooni.org/api/_/global_overview" + - name: "ooni API findings listing" + module: "https_2xx_json" + targets: + - "https://api.ooni.io/api/v1/incidents/search" + - "https://api.ooni.org/api/v1/incidents/search" + # Note: this always returns true by design - name: "OONI API check_report_id" module: "https_2xx_json" diff --git a/ansible/roles/prometheus_alertmanager/tasks/main.yml b/ansible/roles/prometheus_alertmanager/tasks/main.yml index 271ef7c3..ec8e6803 100644 --- a/ansible/roles/prometheus_alertmanager/tasks/main.yml +++ b/ansible/roles/prometheus_alertmanager/tasks/main.yml @@ -1,5 +1,8 @@ - name: Installs packages - tags: monitoring, alertmanager + tags: + - monitoring + - alertmanager + - config apt: install_recommends: no cache_valid_time: 86400 @@ -7,7 +10,10 @@ - prometheus-alertmanager - name: Configure Alertmanager templates - tags: monitoring, alertmanager + tags: + - monitoring + - alertmanager + - config notify: - reload alertmanager copy: @@ -20,14 +26,20 @@ - templates/*.tmpl - name: Configure Alertmanager - tags: alertmanager + tags: + - monitoring + - alertmanager + - config lineinfile: path: /etc/default/prometheus-alertmanager regexp: "^ARGS=" line: ARGS='--cluster.listen-address= --web.listen-address="127.0.0.1:9093" --web.external-url="https://grafana.ooni.org"' - name: Reload Alertmanager - tags: alertmanager + tags: + - monitoring + - alertmanager + - config notify: - reload alertmanager template: diff --git 
a/ansible/roles/prometheus_alertmanager/templates/alertmanager.yml b/ansible/roles/prometheus_alertmanager/templates/alertmanager.yml index d9914808..eb36cf18 100755 --- a/ansible/roles/prometheus_alertmanager/templates/alertmanager.yml +++ b/ansible/roles/prometheus_alertmanager/templates/alertmanager.yml @@ -58,7 +58,7 @@ inhibit_rules: receivers: - name: 'team-all' email_configs: -{% for u in ['arturo', 'simone'] %} +{% for u in ['arturo', 'mehul'] %} - to: '{{ u }}@openobservatory.org' send_resolved: true smarthost: {{ am_mx_openobservatory }}:25 @@ -71,7 +71,7 @@ receivers: - name: 'team-email' # no slack email_configs: -{% for u in ['arturo', 'simone'] %} +{% for u in ['arturo', 'mehul'] %} - to: '{{ u }}@openobservatory.org' send_resolved: true smarthost: {{ am_mx_openobservatory }}:25 diff --git a/ansible/roles/prometheus_blackbox_exporter/tasks/main.yml b/ansible/roles/prometheus_blackbox_exporter/tasks/main.yml index 58385d91..36d660fa 100644 --- a/ansible/roles/prometheus_blackbox_exporter/tasks/main.yml +++ b/ansible/roles/prometheus_blackbox_exporter/tasks/main.yml @@ -7,10 +7,16 @@ mode: 0644 notify: - restart blackbox_exporter - tags: blackbox_exporter + tags: + - monitoring + - blackbox_exporter + - config - name: Setcap command: setcap cap_net_raw=ep /usr/bin/prometheus-blackbox-exporter - tags: blackbox_exporter + tags: + - monitoring + - blackbox_exporter + - config notify: - restart blackbox_exporter diff --git a/ansible/roles/prometheus_node_exporter/defaults/main.yml b/ansible/roles/prometheus_node_exporter/defaults/main.yml new file mode 100644 index 00000000..3433498f --- /dev/null +++ b/ansible/roles/prometheus_node_exporter/defaults/main.yml @@ -0,0 +1,16 @@ +prometheus_nginx_proxy_config: + - location: /metrics/node_exporter + proxy_pass: http://127.0.0.1:8100/metrics + +node_exporter_version: '1.8.2' +node_exporter_arch: 'amd64' +node_exporter_download_url: https://github.com/prometheus/node_exporter/releases/download/v{{ node_exporter_version }}/node_exporter-{{ node_exporter_version }}.linux-{{ node_exporter_arch }}.tar.gz + +node_exporter_bin_path: /usr/local/bin/node_exporter +node_exporter_host: 'localhost' +node_exporter_port: 8100 +node_exporter_options: '' + +node_exporter_state: started +node_exporter_enabled: true +node_exporter_restart: on-failure diff --git a/ansible/roles/prometheus_node_exporter/handlers/main.yml b/ansible/roles/prometheus_node_exporter/handlers/main.yml index 8face3a3..4ec66003 100644 --- a/ansible/roles/prometheus_node_exporter/handlers/main.yml +++ b/ansible/roles/prometheus_node_exporter/handlers/main.yml @@ -14,8 +14,7 @@ name: nginx state: restarted -- name: Reload nftables - tags: nftables - ansible.builtin.systemd_service: - name: nftables - state: reloaded +- name: restart node_exporter + service: + name: node_exporter + state: restarted diff --git a/ansible/roles/prometheus_node_exporter/tasks/install.yml b/ansible/roles/prometheus_node_exporter/tasks/install.yml new file mode 100644 index 00000000..2ad7ccd7 --- /dev/null +++ b/ansible/roles/prometheus_node_exporter/tasks/install.yml @@ -0,0 +1,60 @@ +--- +- name: Check current node_exporter version. + command: "{{ node_exporter_bin_path }} --version" + failed_when: false + changed_when: false + register: node_exporter_version_check + +- name: Download and unarchive node_exporter into temporary location. 
+ unarchive: + src: "{{ node_exporter_download_url }}" + dest: /tmp + remote_src: true + mode: 0755 + when: > + node_exporter_version_check.stdout is not defined + or node_exporter_version not in node_exporter_version_check.stdout + register: node_exporter_download_check + +- name: Move node_exporter binary into place. + copy: + src: "/tmp/node_exporter-{{ node_exporter_version }}.linux-{{ node_exporter_arch }}/node_exporter" + dest: "{{ node_exporter_bin_path }}" + mode: 0755 + remote_src: true + notify: restart node_exporter + when: > + node_exporter_download_check is changed + or node_exporter_version_check.stdout | length == 0 + +- name: Create node_exporter user. + user: + name: node_exporter + shell: /sbin/nologin + state: present + +- name: Copy the node_exporter systemd unit file. + template: + src: node_exporter.service.j2 + dest: /etc/systemd/system/node_exporter.service + mode: 0644 + register: node_exporter_service + +- name: Reload systemd daemon if unit file is changed. + systemd: + daemon_reload: true + notify: restart node_exporter + when: node_exporter_service is changed + +- name: Ensure node_exporter is running and enabled at boot. + service: + name: node_exporter + state: "{{ node_exporter_state }}" + enabled: "{{ node_exporter_enabled }}" + +- name: Verify node_exporter is responding to requests. + uri: + url: "http://{% if node_exporter_host !='' %}{{ node_exporter_host }}{% else %}localhost{% endif %}:{{ node_exporter_port }}/" + return_content: true + register: metrics_output + failed_when: "'Metrics' not in metrics_output.content" diff --git a/ansible/roles/prometheus_node_exporter/tasks/main.yml b/ansible/roles/prometheus_node_exporter/tasks/main.yml index d33fe013..cf9f8229 100644 --- a/ansible/roles/prometheus_node_exporter/tasks/main.yml +++ b/ansible/roles/prometheus_node_exporter/tasks/main.yml @@ -4,13 +4,7 @@ - nginx - node_exporter -- ansible.builtin.include_role: - name: geerlingguy.node_exporter - vars: - node_exporter_host: "localhost" - node_exporter_port: 8100 - tags: - - node_exporter +- include_tasks: install.yml - name: create ooni configuration directory ansible.builtin.file: @@ -18,7 +12,9 @@ state: directory owner: root tags: + - monitoring - node_exporter + - config - name: Add a user to a password file and ensure permissions are set community.general.htpasswd: @@ -26,12 +22,14 @@ name: prom password: "{{ prometheus_metrics_password }}" owner: root - group: www-data + group: nginx mode: 0640 tags: + - monitoring - node_exporter + - config -- name: Setup oonidata nginx config +- name: Setup prometheus nginx config ansible.builtin.template: src: nginx-prometheus.j2 dest: /etc/nginx/sites-enabled/01-prometheus @@ -39,18 +37,19 @@ notify: - Restart nginx tags: + - monitoring - node_exporter - config -- name: Allow prometheus monitoring - ansible.builtin.blockinfile: - path: /etc/ooni/nftables/tcp/9100.nft - create: yes - block: | - add rule inet filter input tcp dport 9100 counter accept comment "Incoming prometheus monitoring" - notify: - - Reload nftables +- ansible.builtin.include_role: + name: nftables + vars: + nft_rules_tcp: + - name: 9100 + rules: + - add rule inet filter input ip saddr 5.9.112.244 tcp dport 9100 counter accept comment "clickhouse prometheus from monitoring.ooni.org" tags: - - nftables + - monitoring - node_exporter - config + - nftables diff --git a/ansible/roles/prometheus_node_exporter/templates/nginx-prometheus.j2 b/ansible/roles/prometheus_node_exporter/templates/nginx-prometheus.j2 index 7d9fbab1..7e68c45c 100644 --- 
a/ansible/roles/prometheus_node_exporter/templates/nginx-prometheus.j2
+++ b/ansible/roles/prometheus_node_exporter/templates/nginx-prometheus.j2
@@ -7,14 +7,18 @@ server {
     access_log /var/log/nginx/{{ inventory_hostname }}.access.log;
     error_log /var/log/nginx/{{ inventory_hostname }}.log warn;
 
-    location /metrics {
+    {% for config in prometheus_nginx_proxy_config %}
+
+    location {{ config['location'] }} {
         auth_basic "Administrator’s Area";
         auth_basic_user_file /etc/ooni/prometheus_passwd;
 
-        proxy_pass http://127.0.0.1:8100;
+        proxy_pass {{ config['proxy_pass'] }};
         proxy_set_header X-Real-IP $remote_addr;
         proxy_set_header Host $host;
         proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
     }
-}
\ No newline at end of file
+
+    {% endfor %}
+}
diff --git a/ansible/roles/prometheus_node_exporter/templates/node_exporter.service.j2 b/ansible/roles/prometheus_node_exporter/templates/node_exporter.service.j2
new file mode 100644
index 00000000..42cb98cc
--- /dev/null
+++ b/ansible/roles/prometheus_node_exporter/templates/node_exporter.service.j2
@@ -0,0 +1,11 @@
+[Unit]
+Description=NodeExporter
+
+[Service]
+TimeoutStartSec=0
+User=node_exporter
+ExecStart={{ node_exporter_bin_path }} --web.listen-address={{ node_exporter_host }}:{{ node_exporter_port }} {{ node_exporter_options }}
+Restart={{ node_exporter_restart }}
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/ssh_users/tasks/main.yml b/ansible/roles/ssh_users/tasks/main.yml
index 0d994377..d67b7acb 100644
--- a/ansible/roles/ssh_users/tasks/main.yml
+++ b/ansible/roles/ssh_users/tasks/main.yml
@@ -51,20 +51,20 @@
     mode: 0400
   with_items: "{{ admin_usernames | union(non_admin_usernames) }}"
 
+- name: configure sshd
+  include_role:
+    name: willshersystems.sshd
+  vars:
+    sshd_skip_defaults: false
+    sshd:
+      AllowUsers: "{{ admin_usernames | union(non_admin_usernames) | sort | join(' ') }}"
 
-- name: kill processes running as deactivated users
-  ansible.builtin.shell:
-    cmd: "pkill -U {{ item }}"
-  ignore_errors: true
-  with_items: "{{ deactivated_usernames }}"
-
-- name: remove any stale users
-  user:
-    name: "{{ item }}"
-    state: "absent"
-    remove: yes
-    force: yes
-  with_items: "{{ deactivated_usernames }}"
+- name: Ensure sudoers dir exists
+  ansible.builtin.file:
+    path: /etc/sudoers.d
+    state: directory
+    owner: root
+    group: root
 
 - name: sudoers.d/80-admins
   template:
@@ -80,10 +80,7 @@
     path: /etc/sudoers.d/adm
     state: absent
 
-- name: configure sshd
-  include_role:
-    name: willshersystems.sshd
-  vars:
-    sshd_skip_defaults: false
-    sshd:
-      AllowUsers: "{{ admin_usernames | union(non_admin_usernames) | sort | join(' ') }}"
+- name: reload sshd
+  ansible.builtin.systemd_service:
+    name: sshd
+    state: reloaded
diff --git a/ansible/roles/tailnet/tasks/main.yml b/ansible/roles/tailnet/tasks/main.yml
new file mode 100644
index 00000000..86bc4b3d
--- /dev/null
+++ b/ansible/roles/tailnet/tasks/main.yml
@@ -0,0 +1,4 @@
+- ansible.builtin.include_role:
+    name: artis3n.tailscale
+  tags:
+    - tailnet
diff --git a/docs/DebianPackages.md b/docs/DebianPackages.md
new file mode 100644
index 00000000..53be5af3
--- /dev/null
+++ b/docs/DebianPackages.md
@@ -0,0 +1,30 @@
+# Debian packages
+
+**NOTE** The direction we are going with the new backend is that of dropping Debian packaging of all backend API components and moving to a dockerized deployment approach.
+
+This section lists the Debian packages used to deploy backend
+components. 
They are built by [GitHub CI workflows](#github-ci-workflows) 💡
+and deployed using [The deployer tool](#the-deployer-tool) 🔧. See
+[Debian package build and publish](#debian-package-build-and-publish) 💡.
+
+
+#### ooni-api package
+Debian package for the [API](#api) ⚙
+
+
+#### fastpath package
+Debian package for the [Fastpath](#fastpath) ⚙
+
+
+#### detector package
+Debian package for the
+[Social media blocking event detector](#social-media-blocking-event-detector) ⚙
+
+
+#### analysis package
+The `analysis` Debian package contains various tools and runs several
+systemd timers, see [Systemd timers](#systemd-timers) 💡.
+
+
+#### Analysis deployment
+See [Backend component deployment](#backend-component-deployment) 📒
diff --git a/docs/DeprecatedDocs.md b/docs/DeprecatedDocs.md
new file mode 100644
index 00000000..113d91dc
--- /dev/null
+++ b/docs/DeprecatedDocs.md
@@ -0,0 +1,141 @@
+## Test helper rotation runbook
+This runbook provides hints to troubleshoot the rotation of test
+helpers. In this scenario test helpers are not being rotated as expected
+and their TLS certificates might be at risk of expiring.
+
+Steps:
+
+1. Review [Test helpers](#comp:test_helpers), [Test helper rotation](#comp:test_helper_rotation) and [Test helpers notebook](#test-helpers-notebook) 📔
+
+2. Review the charts on [Test helpers dashboard](#test-helpers-dashboard) 📊.
+   Look at different timespans:
+
+   a. The uptime of the test helpers should be staggered by a week
+      depending on [Test helper rotation](#test-helper-rotation) ⚙.
+
+3. A summary of the live and last rotated test helpers can be obtained
+   with:
+
+```sql
+SELECT rdn, dns_zone, name, region, draining_at FROM test_helper_instances ORDER BY name DESC LIMIT 8
+```
+
+4. The rotation tool can be started manually. It will always pick the
+   oldest host for rotation. ⚠️ Due to the propagation time of changes
+   in the DNS, rotating many test helpers too quickly can impact the
+   probes.
+
+   a. Log on [backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥
+
+   b. Check the last run using
+      `sudo systemctl status ooni-rotation.timer`
+
+   c. Review the logs using `sudo journalctl -u ooni-rotation`
+
+   d. Run `sudo systemctl restart ooni-rotation` and monitor the logs.
+
+5. Review the charts on [Test helpers dashboard](#test-helpers-dashboard) 📊
+   during and after the rotation.
+
+
+### Test helpers failure runbook
+This runbook presents a scenario where a test helper is causing probes
+to fail their tests sporadically. It describes how to identify the
+affected host and mitigate the issue but can also be used to investigate
+other issues affecting the test helpers.
+
+It has been chosen because this kind of incident can impact the quality
+of measurements and can be relatively difficult to troubleshoot.
+
+For investigating glitches in the
+[test helper rotation](#test-helper-rotation) ⚙ see
+[test helper rotation runbook](#test-helper-rotation-runbook) 📒.
+
+In this scenario either an alert has been sent to the
+[#ooni-bots](#topic:oonibots) [Slack](#slack) 🔧 channel by
+the [test helper failure rate notebook](#test-helper-failure-rate-notebook) 📔 or something
+else caused the investigation.
+See [Alerting](#alerting) 💡 for details.
+
+Steps:
+
+1. Review [Test helpers](#test-helpers) ⚙
+
+2. Review the charts on [Test helpers dashboard](#test-helpers-dashboard) 📊.
+   Look at different timespans:
+
+   a. The uptime of the test helpers should be staggered by a week
+      depending on [Test helper rotation](#test-helper-rotation) ⚙.
+
+   b. 
The in-flight requests and requests per second should be
+      consistent across hosts, except for `0.th.ooni.org`. See
+      [Test helpers list](#test-helpers-list) 🐝 for details.
+
+   c. Review CPU load, memory usage and run duration percentiles.
+
+3. Review [Test helper failure rate notebook](#test-helper-failure-rate-notebook) 📔
+
+4. For more detailed investigation there is also a [test helper notebook](https://jupyter.ooni.org/notebooks/notebooks/2023%20%5Bfederico%5D%20test%20helper%20metadata%20in%20fastpath.ipynb)
+
+5. Log on the hosts using
+   `ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -Snone root@0.th.ooni.org`
+
+6. Run `journalctl --since '1 hour ago'` or review logs using the query
+   below.
+
+7. Run `top`, `strace`, `tcpdump` as needed.
+
+8. The rotation tool can be started at any time to rotate away failing
+   test helpers. The rotation script will always pick the oldest host
+   for rotation. ⚠️ Due to the propagation time of changes in the DNS,
+   rotating many test helpers too quickly can impact the probes.
+
+   a. Log on [backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥
+
+   b. Check the last run using
+      `sudo systemctl status ooni-rotation.timer`
+
+   c. Review the logs using `sudo journalctl -u ooni-rotation`
+
+   d. Run `sudo systemctl restart ooni-rotation` and monitor the logs.
+
+9. Review the charts on [Test helpers dashboard](#test-helpers-dashboard) 📊
+   during and after the rotation.
+
+10. Summarize traffic hitting a test helper using the following commands:
+
+    Top 10 miniooni probe IP addresses (Warning: this is sensitive data)
+
+    `tail -n 100000 /var/log/nginx/access.log | grep miniooni | cut -d' ' -f1|sort|uniq -c|sort -nr|head`
+
+    Similar, with anonymized IP addresses:
+
+    `grep POST /var/log/nginx/access.log | grep miniooni | cut -d'.' -f1-3 | head -n 10000 |sort|uniq -c|sort -nr|head`
+
+    Number of requests from miniooni probes in 10-minute buckets:
+
+    `grep POST /var/log/nginx/access.log | grep miniooni | cut -d' ' -f4 | cut -c1-17 | uniq -c`
+
+    Number of requests from miniooni probes in 1-minute buckets:
+
+    `grep POST /var/log/nginx/access.log | grep miniooni | cut -d' ' -f4 | cut -c1-18 | uniq -c`
+
+    Number of requests grouped by hour, cache HIT/MISS/etc, software name and version:
+
+    `head -n 100000 /var/log/nginx/access.log | awk '{print $4, $6, $13}' | cut -c1-15,22- | sort | uniq -c | sort -n`
+
+To extract data from the centralized log database
+on [monitoring.ooni.org](#monitoring.ooni.org) 🖥 you can use:
+
+``` sql
+SELECT message FROM logs
+WHERE SYSLOG_IDENTIFIER = 'oohelperd'
+ORDER BY __REALTIME_TIMESTAMP DESC
+LIMIT 10
+```
+
+> **note**
+> The table is indexed by `__REALTIME_TIMESTAMP`. Limiting the range by time can significantly increase query performance.
+
+
+See [Selecting test helper for rotation](#selecting-test-helper-for-rotation) 🐞
diff --git a/docs/IncidentResponse.md b/docs/IncidentResponse.md
new file mode 100644
index 00000000..bc12a048
--- /dev/null
+++ b/docs/IncidentResponse.md
@@ -0,0 +1,354 @@
+# Incident response
+
+## On-call preparation
+Review [Alerting](#alerting) 💡 and check
+[Grafana dashboards](#grafana-dashboards) 💡
+
+On Android devices the following apps can be used:
+
+ * [Slack](#slack) 🔧 app with audible notifications from the
+   #ooni-bots channel
+
+ * [Grafana](#grafana) 🔧 viewer
+
+
+## Severities
+
+When designing the architecture of backend components or handling incidents it can be useful to have
+defined severities and tiers. 
+
+A set of guidelines is described at
+This section presents a simplified approach to prioritizing incident response.
+
+In this case there is no distinction between severity and priority. Impact and response time are connected.
+
+Incidents and alarms from monitoring can be classified by severity levels based on their impact:
+
+ - 1: Serious security breach or data loss; serious loss of privacy impacting users or team members; legal risks.
+ - 2: Downtime impacting service usability for a significant fraction of users or a tier 0 component; serious security vulnerability.
+   Examples: probes being unable to submit measurements
+ - 3: Downtime or poor performance impacting secondary services (tier 1 or above); anything that can cause a level 2 event if not addressed within 24h; outages of monitoring infrastructure
+ - 4: Every other event that requires attention within 7 days
+
+For an outline of infrastructure tiers see [infrastructure tiers](devops/infrastructure).
+
+### Relations and dependencies between services
+
+Tiers are useful during design and deployment as a way to minimize the risk of outages and avoid unexpected cascading failures.
+
+Having a low tier value should not be treated as a sign of "importance" for a component, but as a liability.
+
+Pre-production deployment stages (e.g. testbed) have tier level >= 5
+
+In this context a component can be a service as a whole, or a running process (daemon), a host, a hardware device, etc.
+A component can contain other components.
+
+A component "A" is said to "hard depend" on another component "B" if an outage of B triggers an outage of A.
+
+It can also "soft depend" on another component if an outage of the latter triggers only a failure of a subsystem, an ancillary feature, or a reasonably short downtime.
+
+Regardless of tiers, components at a higher stage (e.g. production) cannot depend on and/or receive data from lower stages. The opposite is acceptable.
+
+Components can only hard-depend on other components at the same tier or with lower values.
+E.g. a Tier 2 component can depend on a Tier 1 but not the other way around.
+If it happens, the Tier 2 component should be immediately re-classified as Tier 1 and treated accordingly (see below).
+
+E.g. anything that handles real-time failover for a service should be treated at the same tier (or lower value) as the service.
+
+Redundant components follow a special rule. For example, the "test helper" service provided to the probes, as a whole, should be considered tier 2 at least,
+as it can impact all probes, preventing them from running tests successfully.
+Yet, test helper processes and VMs can be considered tier 3 or even 4 if they sit behind a load balancer that can move traffic away from a failing host reliably
+and with no significant downtime.
+
+Example: An active/standby database pair provides a tier 2 service. An automatic failover tool is triggered by a simple monitoring script.
+Both have to be labeled tier 2.
+
+### Handling incidents
+
+Depending on the severity of an event a different workflow can be followed.
+
+
An example of incident management workflow can be:
+
+| Severity | Response time | Requires conference call | Requires call leader | Requires postmortem | Sterile |
+| -------- | ------- | ------ | -------- | ------- | ------ |
+| 1 | 2h | Yes | Yes | Yes | Yes |
+| 2 | 8h | Yes | No | Yes | Yes |
+| 3 | 24h | No | No | No | Yes |
+| 4 | 7d | No | No | No | No |
+
+An incident is "sterile" when, during the investigation, the only priority is to solve the issue at hand.
+Other investigations, discussions and meetings should be postponed.
+
+When in doubt about the severity of an event, always err on the safe side.
+
+### Regular operations
+
+Based on the tier of a component, development and operation can follow different rules.
+
+An example of such rules for regular operations can be:
+
+| Tier | Require architecture review | Require code review | Require 3rd party security review | Require Change Management |
+| -------- | ------- | ------ | -------- | ------- |
+| 1 | Yes | Yes | Yes | Yes |
+| 2 | Yes | Yes | No | No |
+| 3 | No | Yes | No | No |
+| 4 | No | No | No | No |
+
+"Change Management" refers to planning operational changes in advance and having team members review the change to be deployed in advance.
+
+E.g. scheduling a meeting to perform a probe release and having 2 people review the metrics before and after the change.
+
+
+## Redundant notifications
+If needed, a secondary channel for alert notification can be set up
+using [ntfy.sh](https://ntfy.sh/).
+
+Ntfy can host a push notification topic for free.
+
+For example, an ntfy topic is currently being used to
+notify the outcome of CI runs from one of our repositories.
+
+An Android app is also available.
+
+[Grafana](#grafana) 🔧 can be configured to send alerts to ntfy.sh
+using a webhook.
+
+### Measurement drop tutorial
+
+This tutorial provides examples of how to investigate a drop in measurements.
+It is based on an incident where a drop in measurements was detected and the cause was not immediately clear.
+
+It is not meant to be a step-by-step runbook but rather to give hints on what data to look for, how to generate charts, and how to identify the root cause of an incident.
+
+A dedicated issue can be used to track the incident and the investigation effort and provide visibility:
+https://github.com/ooni/sysadmin/blob/master/.github/ISSUE_TEMPLATE/incident.md
+The issue can be filed during or after the incident depending on urgency.
+
+Some of the examples below come from
+https://jupyter.ooni.org/notebooks/notebooks/android_probe_release_msm_drop_investigation.ipynb
+During an investigation it can be good to create a dedicated Jupyter notebook.
+
+We started with reviewing:
+
+ *
+   No issues detected as the charts show a short timespan.
+ * The charts on [Test helpers dashboard](#test-helpers-dashboard) 📊.
+   No issues detected here.
+ * The [API and fastpath](#api-and-fastpath) 📊 dashboard.
+   No issues detected here.
+ * The [Long term measurements prediction notebook](#long-term-measurements-prediction-notebook) 📔
+   The decrease was clearly showing.
+
+Everything looked OK in terms of backend health. We then generated the following charts.
+
+The chunks of Python code below are meant to be run in
+[Jupyter Notebook](#jupyter-notebook) 🔧 and are mostly "self-contained".
+To use them you only need to import the
+[Ooniutils microlibrary](#ooniutils-microlibrary) 💡:
+
+``` python
+%run ooniutils.ipynb
+```
+
+The "t" label is commonly used in existing notebooks to refer to hour/day/week time slices. 
+
+We want to plot how many measurements we are receiving from Ooniprobe Android in unattended runs, grouped by day and by `software_version`.
+
+The last line generates an area chart using Altair. Notice that the `x`, `y` and `color` parameters match the 3 columns extracted by the `SELECT`.
+
+The `GROUP BY` is performed on 2 of those 3 columns, while `COUNT(*)` is counting how many measurements exist in each t/software_version "bucket".
+
+The output of the SQL query is just a dataframe with 3 columns. There is no need to pivot or reindex it as Altair does the data transformation required.
+
+> **note**
+> Altair refuses to process dataframes with more than 5000 rows.
+
+``` python
+x = click_query("""
+    SELECT
+      toStartOfDay(toStartOfWeek(measurement_start_time)) AS t,
+      software_version,
+      COUNT(*) AS msm_cnt
+    FROM fastpath
+    WHERE measurement_start_time > today() - interval 3 month
+    AND measurement_start_time < today()
+    AND software_name = 'ooniprobe-android-unattended'
+    GROUP BY t, software_version
+""")
+alt.Chart(x).mark_area().encode(x='t', y='msm_cnt', color='software_version').properties(width=1000, height=200, title="Android unattended msm cnt")
+```
+
+The generated chart was:
+
+![chart](../../../assets/images-backend/msm_drop_investigation_1.png)
+
+From the chart we concluded that the overall number of measurements has been decreasing since the release of a new version.
+We also re-ran the plot by filtering on other `software_name` values and saw no other type of probe was affected.
+
+> **note**
+> Due to a limitation in Altair, when grouping time by week use
+> `toStartOfDay(toStartOfWeek(measurement_start_time)) AS t`
+
+Then we wanted to measure how many measurements are being collected during each `web_connectivity` test run.
+This is to understand if probes are running fewer measurements in each run.
+
+The following Python snippet uses nested SQL queries. The inner query groups measurements by time, `software_version` and `report_id`,
+and counts how many measurements are related to each `report_id`.
+The outer query "ignores" the `report_id` value and `quantile()` is used to extract the 50th percentile of `msm_cnt`.
+
+> **note**
+> The use of double `%%` in `LIKE` is required to escape the `%` wildcard. The wildcard is used to match any amount of characters.
+
+``` python
+x = click_query("""
+    SELECT
+        t,
+        quantile(0.5)(msm_cnt) AS msm_cnt_p50,
+        software_version
+    FROM (
+        SELECT
+            toStartOfDay(toStartOfWeek(measurement_start_time)) AS t,
+            software_version,
+            report_id,
+            COUNT(*) AS msm_cnt
+        FROM fastpath
+        WHERE measurement_start_time > today() - interval 3 month
+        AND test_name = 'web_connectivity'
+        AND measurement_start_time < today()
+        AND software_name = 'ooniprobe-android-unattended'
+        AND software_version LIKE '3.8%%'
+        GROUP BY t, software_version, report_id
+    ) GROUP BY t, software_version
+""")
+alt.Chart(x).mark_line().encode(x='t', y='msm_cnt_p50', color='software_version').properties(width=1000, height=200, title="Android unattended msmt count per report")
+```
+
+We also compared different version groups and different `software_name`.
+The output shows that indeed the number of measurements for each run is significantly lower for the newly released versions.
+
+![chart](../../../assets/images-backend/msm_drop_investigation_4.png)
+
+To update the previous Python snippet to group measurements by a different field, change `software_version` into the new column name. 
+For example use `probe_cc` to show a chart with a breakdown by probe country name. You should change `software_version` once in each SELECT part,
+then in the last two `GROUP BY`, and finally in the `color` line at the bottom.
+
+We made this change to confirm that all countries were impacted in the same way (the output is not included here as it was not remarkable).
+
+Also, `mark_line` on the bottom line is used to create line charts. Switch it to `mark_area` to generate *stacked* area charts.
+See the previous two charts as examples.
+
+We implemented a change to the API to improve logging of the list of tests returned at check-in,
+and monitored the logs using `sudo journalctl -f -u ooni-api`.
+
+The output showed that the API is very often returning 100 URLs to probes.
+
+We then ran a similar query to extract the test duration time by calculating
+`MAX(measurement_start_time) - MIN(measurement_start_time) AS delta` for each `report_id` value:
+
+``` python
+x = click_query("""
+    SELECT t, quantile(0.5)(delta) AS deltaq, software_version
+    FROM (
+        SELECT
+            toStartOfDay(toStartOfWeek(measurement_start_time)) AS t,
+            software_version,
+            report_id,
+            MAX(measurement_start_time) - MIN(measurement_start_time) AS delta
+        FROM fastpath
+        WHERE measurement_start_time > today() - interval 3 month
+        AND test_name = 'web_connectivity'
+        AND measurement_start_time < today()
+        AND software_name = 'ooniprobe-android-unattended'
+        AND software_version LIKE '3.8%%'
+        GROUP BY t, software_version, report_id
+    ) GROUP BY t, software_version
+""")
+alt.Chart(x).mark_line().encode(x='t', y='deltaq', color='software_version').properties(width=1000, height=200, title="Android unattended test run time")
+```
+
+![chart](../../../assets/images-backend/msm_drop_investigation_2.png)
+
+The chart showed that the tests are indeed running for a shorter amount of time.
+
+> **note**
+> Percentiles can be more meaningful than averages.
+> To calculate quantiles in ClickHouse use `quantile()()`.
+
+Example:
+
+``` sql
+quantile(0.1)(delta) AS deltaq10
+```
+
+Wondering if the slowdown was due to slower measurement execution or other issues, we also generated a table as follows.
+
+> **note**
+> Showing color bars makes it quicker to visually inspect tables. 
Setting the axis value to `0`, `1` or `None` helps readability:
+> `y.style.bar(axis=None)`
+
+Notice the `delta / msmcnt AS seconds_per_msm` calculation:
+
+``` python
+y = click_query("""
+    SELECT
+        quantile(0.1)(delta) AS deltaq10,
+        quantile(0.3)(delta) AS deltaq30,
+        quantile(0.5)(delta) AS deltaq50,
+        quantile(0.7)(delta) AS deltaq70,
+        quantile(0.9)(delta) AS deltaq90,
+
+        quantile(0.5)(seconds_per_msm) AS seconds_per_msm_q50,
+        quantile(0.5)(msmcnt) AS msmcnt_q50,
+
+        software_version, software_name
+    FROM (
+        SELECT
+            software_version, software_name,
+            report_id,
+            MAX(measurement_start_time) - MIN(measurement_start_time) AS delta,
+            count(*) AS msmcnt,
+            delta / msmcnt AS seconds_per_msm
+        FROM fastpath
+        WHERE measurement_start_time > today() - interval 3 month
+        AND test_name = 'web_connectivity'
+        AND measurement_start_time < today()
+        AND software_name IN ['ooniprobe-android-unattended', 'ooniprobe-android']
+        AND software_version LIKE '3.8%%'
+        GROUP BY software_version, report_id, software_name
+    ) GROUP BY software_version, software_name
+    ORDER by software_version, software_name ASC
+""")
+y.style.bar(axis=None)
+```
+
+![chart](../../../assets/images-backend/msm_drop_investigation_3.png)
+
+In the table we looked at the `seconds_per_msm_q50` column: the median time for running each test did not change significantly.
+
+To summarize:
+ * The backend appears to deliver the same amount of URLs to the Probes as usual.
+ * The time required to run each test is roughly the same.
+ * Both the number of measurements per run and the run time decreased in the new releases.
+
+## Github issues
+
+### Selecting test helper for rotation
+See
+
+
+### Document Tor targets
+See
+
+
+### Disable unnecessary ClickHouse system tables
+See
+
+
+### Feed fastpath from JSONL
+See
+
+
+### Implement Grafana dashboard and alarms backup
+See
diff --git a/docs/Infrastructure.md b/docs/Infrastructure.md
new file mode 100644
index 00000000..4b51eb42
--- /dev/null
+++ b/docs/Infrastructure.md
@@ -0,0 +1,205 @@
+# Infrastructure
+
+Our infrastructure is primarily spread across the following providers:
+
+* Hetzner, for dedicated hosts
+* DigitalOcean, for VPSs which require IPv6 support
+* AWS, for most cloud-based infrastructure hosting
+
+We manage the deployment and configuration of hosts through a combination of ansible and terraform.
+
+## Infrastructure Tiers
+
+We divide our infrastructure components into 3 tiers:
+
+- **Tier 0: Critical**: These are mission critical infrastructure components. If these become unavailable or have significant disruption, it will have a major impact.
+
+- **Tier 1: Essential**: These components are important, but not as critical as
+  tier 0. They are part of our core operations, but if they become unavailable
+  the impact is important, but not major.
+
+- **Tier 2: Non-Essential**: These are auxiliary components. Their
+  unavailability does not have a major impact. 
+
+### Tier 0 (Critical) components
+
+- [ ] Probe Services (collector specifically)
+- [ ] Fastpath (part responsible for storing post-cans)
+- [x] DNS configuration
+- [ ] OONI bridges
+- [x] Web Connectivity test helpers
+
+### Tier 1 (Essential) components
+
+- [ ] OONI API measurement listing
+- [x] OONI Explorer
+- [x] OONI Run
+- [ ] Monitoring
+- [ ] OONI.org website
+- [x] Code signing
+- [ ] OONI Data analysis pipeline
+- [x] OONI Findings API
+- [x] Website analytics
+
+### Tier 2 (Non-Essential) components
+
+- [ ] Test list editor
+- [ ] Jupyter notebooks
+- [ ] Countly
+
+## Hosts
+
+This section provides a summary of the backend hosts described in the
+rest of the document.
+
+A full list is available at
+ -
+also see [Ansible](#ansible) 🔧
+
+### backend-fsn.ooni.org
+
+Public-facing production backend host, receiving the deployment of the
+packages:
+
+- [ooni-api](legacybackend/operations/#ooni-api-package) 📦
+
+- [fastpath](legacybackend/operations/#fastpath-package) 📦
+
+- [analysis](legacybackend/operations/#analysis-package) 📦
+
+- [detector](legacybackend/operations/#detector-package) 📦
+
+### backend-hel.ooni.org
+
+Standby / pre-production backend host. Runs the same software stack as
+[backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥, plus the
+[OONI bridges](#ooni-bridges) ⚙
+
+### monitoring.ooni.org
+
+Runs the internal monitoring stack, including
+[Jupyter Notebook](#tool:jupyter), [Prometheus](#prometheus) 🔧,
+[Vector](#vector) 🔧 and the
+[ClickHouse instance for logs](#clickhouse-instance-for-logs) ⚙
+
+## Etckeeper
+
+Etckeeper is deployed on backend
+hosts and keeps the `/etc` directory under git version control. It
+commits automatically on package deployment and on timed runs. It also
+allows doing commits manually.
+
+To check the history of the /etc directory:
+
+```bash
+sudo -i
+cd /etc
+git log --raw
+```
+
+And `git diff` for unmerged changes.
+
+Use `etckeeper commit ` to commit changes.
+
+:::tip
+Etckeeper commits changes automatically when APT is used or on a daily basis, whichever comes first.
+:::
+
+## Devops credentials
+
+Credentials necessary for the deployment of backend infrastructure components should be stored inside of [AWS Systems Manager Parameter Store](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html). The same key name should be used in both the production and development environments, but a different value shall be used across environments.
+
+:::note
+We were previously using Secrets Manager, but are in the process of moving all secrets over to Parameter Store, see: https://github.com/ooni/devops/issues/114.
+
+Once this is complete this note can be removed.
+:::
+
+## DNS and Domains
+
+The primary domains used by the backend are:
+- `ooni.org`
+- `ooni.io`
+- `ooni.nu`
+
+DNS is managed inside of Route53. Where a static configuration is needed, this is added to the terraform `tf/environments/prod/dns_records.tf` file. For records that are populated as part of IaC deployments, those can be registered and written directly using terraform itself.
+
+For the `ooni.io` and `ooni.nu` zones, we have also delegated two sub zones each: one for the `dev` and one for the `prod` environment. This allows the dev environment to manage its own zone, like the production environment would, while also properly compartmentalizing it (the delegation can be verified with `dig`, as shown below).
+
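+As an illustration (a verification aid only, not part of the managed configuration; the exact name servers returned depend on the Route53 hosted zones):
+
+```bash
+# Check that the dev zone is delegated to its own set of name servers
+dig NS dev.ooni.io +short
+
+# Compare with the parent zone to confirm the delegation records differ
+dig NS ooni.io +short
+```
+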
+This leads us to having the following zones:
+* `ooni.org` root zone, managed in the prod environment
+* `ooni.io` root zone, managed in the prod environment
+* `ooni.nu` root zone, managed in the prod environment
+* `prod.ooni.io` delegated zone, managed in the prod environment
+* `prod.ooni.nu` delegated zone, managed in the prod environment
+* `dev.ooni.io` delegated zone, managed in the dev environment
+* `dev.ooni.nu` delegated zone, managed in the dev environment
+
+### DNS naming policy
+
+The public facing name of services follows this format:
+
+- `<name>.ooni.org`
+
+Examples:
+
+- `explorer.ooni.org`
+- `run.ooni.org`
+
+Public-facing means the FQDNs are used directly by external users, services, or
+embedded in the probes. They cannot be changed or retired without causing
+outages.
+
+Use public facing names sparingly and when possible start off by creating a
+private name first.
+Not every host needs to have a public facing name. For example staging and
+testing environments might not have a public facing name.
+
+Each service also has a public name which points to the specific host running that
+service, and these are hosted in the `.io` zone.
+This is helpful because sometimes you might have the same host running multiple
+services or you might also have multiple services behind the same public service
+endpoint (eg. in the case of an API gateway setup).
+
+Names in the `.io` zone should always also include the environment name they are
+related to:
+
+- `<name>.prod.ooni.io` for production services
+- `<name>.test.ooni.io` for test services
+
+When there may be multiple instances of a service running, you can append a
+number to the service name. Otherwise the service name should be only alphabetic
+characters.
+
+Examples:
+
+- `clickhouse.prod.ooni.io`
+- `postgres0.prod.ooni.io`
+- `postgres1.prod.ooni.io`
+- `prometheus.prod.ooni.io`
+- `grafana.prod.ooni.io`
+
+Finally, the actual host which runs the service should have an FQDN defined
+inside of the `.nu` zone.
+
+This might not apply to every host, especially in a cloud environment. The FQDNs
+in the `.nu` zone are the ones which are going to be stored in the ansible inventory
+file and will be used as targets for configuration management.
+
+The structure of these domains is:
+
+- `<name>.<location>.[prod|test].ooni.nu`
+
+The location tag can be either just the provider name, or the provider name
+followed by `-` and the location.
+
+Here is a list of location tags:
+
+- `htz-fsn`: Hetzner in Falkenstein
+- `htz-hel`: Hetzner in Helsinki
+- `grh-ams`: Greenhost in Amsterdam
+- `grh-mia`: Greenhost in Miami
+- `aws-fra`: AWS in Europe (Frankfurt)
+
+Examples:
+
+- `monitoring.htz-fsn.prod.ooni.nu`
diff --git a/docs/LegacyDocs.md b/docs/LegacyDocs.md
new file mode 100644
index 00000000..ad68451c
--- /dev/null
+++ b/docs/LegacyDocs.md
@@ -0,0 +1,232 @@
+# Legacy Docs
+
+**ATTENTION** this documentation covers topics that are still relevant, yet it may not be up to date with the currently defined best practices or infrastructure status.
+
+### Creating new playbooks runbook
+
+**TODO** this needs to be rewritten to conform to the new policies
+
+
+This runbook describes how to add new playbooks or modify existing playbooks to support new hosts.
+
+When adding a new host to an existing group, if no customization is required it is enough to modify `inventory`
+and insert the hostname in the same locations as its peers.
+
+If the host requires small customization e.g. a different configuration file for the <>:
+
+1. add the hostname to `inventory` as described above
+2. 
create "custom" blocks in `tasks/main.yml` to adapt the deployment steps to the new host using the `when:` syntax. + +For an example see: + +NOTE: Complex `when:` rules can lower the readability of `main.yml` + +When adding a new type of backend component that is different from anything already existing a new dedicated role can be created: + +1. add the hostname to `inventory` as described above +2. create a new playbook e.g. `ansible/deploy-newcomponent.yml` +3. copy files from an existing role into a new `ansible/roles/newcomponent` directory: + +- `ansible/roles/newcomponent/meta/main.yml` +- `ansible/roles/newcomponent/tasks/main.yml` +- `ansible/roles/newcomponent/templates/example_config_file` + +4. run `./play deploy-newcomponent.yml -l newhost.ooni.org --diff -C` and review the output +5. run `./play deploy-newcomponent.yml -l newhost.ooni.org --diff` and review the output + +Example: + +TIP: To ensure playbooks are robust and idemponent it can be beneficial to develop and test tasks incrementally by running the deployment commands often. + + +## Test helper rotation runbook +This runbook provides hints to troubleshoot the rotation of test +helpers. In this scenario test helpers are not being rotated as expected +and their TLS certificates might be at risk of expiring. + +Steps: + +1. Review [Test helpers](#comp:test_helpers), [Test helper rotation](#comp:test_helper_rotation) and [Test helpers notebook](#test-helpers-notebook) 📔 + +2. Review the charts on [Test helpers dashboard](#test-helpers-dashboard) 📊. + Look at different timespans: + + a. The uptime of the test helpers should be staggered by a week + depending on [Test helper rotation](#test-helper-rotation) ⚙. + +3. A summary of the live and last rotated test helper can be obtained + with: + +```sql +SELECT rdn, dns_zone, name, region, draining_at FROM test_helper_instances ORDER BY name DESC LIMIT 8 +``` + +4. The rotation tool can be started manually. It will always pick the + oldest host for rotation. ⚠️ Due to the propagation time of changes + in the DNS rotating many test helpers too quickly can impact the + probes. + + a. Log on [backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥 + + b. Check the last run using + `sudo systemctl status ooni-rotation.timer` + + c. Review the logs using `sudo journalctl -u ooni-rotation` + + d. Run `sudo systemctl restart ooni-rotation` and monitor the logs. + +5. Review the charts on [Test helpers dashboard](#test-helpers-dashboard) 📊 + during and after the rotation. + + +### Test helpers failure runbook +This runbook presents a scenario where a test helper is causing probes +to fail their tests sporadically. It describes how to identify the +affected host and mitigate the issue but can also be used to investigate +other issues affecting the test helpers. + +It has been chosen because such kind of incidents can impact the quality +of measurements and can be relatively difficult to troubleshoot. + +For investigating glitches in the +[test helper rotation](#test-helper-rotation) ⚙ see +[test helper rotation runbook](#test-helper-rotation-runbook) 📒. + +In this scenario either an alert has been sent to the +[#ooni-bots](#topic:oonibots) [Slack](#slack) 🔧 channel by +the [test helper failure rate notebook](#test-helper-failure-rate-notebook) 📔 or something +else caused the investigation. +See [Alerting](#alerting) 💡 for details. + +Steps: + +1. Review [Test helpers](#test-helpers) ⚙ + +2. Review the charts on [Test helpers dashboard](#test-helpers-dashboard) 📊. 
+   Look at different timespans:
+
+   a. The uptime of the test helpers should be staggered by a week
+   depending on [Test helper rotation](#test-helper-rotation) ⚙.
+
+   b. The in-flight requests and requests per second should be
+   consistent across hosts, except for `0.th.ooni.org`. See
+   [Test helpers list](#test-helpers-list) 🐝 for details.
+
+   c. Review CPU load, memory usage and run duration percentiles.
+
+3. Review [Test helper failure rate notebook](#test-helper-failure-rate-notebook) 📔
+
+4. For more detailed investigation there is also a [test helper notebook](https://jupyter.ooni.org/notebooks/notebooks/2023%20%5Bfederico%5D%20test%20helper%20metadata%20in%20fastpath.ipynb)
+
+5. Log on the hosts using
+   `ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -Snone root@0.th.ooni.org`
+
+6. Run `journalctl --since '1 hour ago'` or review logs using the query
+   below.
+
+7. Run `top`, `strace`, `tcpdump` as needed.
+
+8. The rotation tool can be started at any time to rotate away failing
+   test helpers. The rotation script will always pick the oldest host
+   for rotation. ⚠️ Due to the propagation time of changes in the DNS,
+   rotating many test helpers too quickly can impact the probes.
+
+   a. Log on [backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥
+
+   b. Check the last run using
+      `sudo systemctl status ooni-rotation.timer`
+
+   c. Review the logs using `sudo journalctl -u ooni-rotation`
+
+   d. Run `sudo systemctl restart ooni-rotation` and monitor the logs.
+
+9. Review the charts on [Test helpers dashboard](#test-helpers-dashboard) 📊
+   during and after the rotation.
+
+10. Summarize traffic hitting a test helper using the following commands:
+
+    Top 10 miniooni probe IP addresses (Warning: this is sensitive data)
+
+    `tail -n 100000 /var/log/nginx/access.log | grep miniooni | cut -d' ' -f1|sort|uniq -c|sort -nr|head`
+
+    Similar, with anonymized IP addresses:
+
+    `grep POST /var/log/nginx/access.log | grep miniooni | cut -d'.' -f1-3 | head -n 10000 |sort|uniq -c|sort -nr|head`
+
+    Number of requests from miniooni probes in 10-minute buckets:
+
+    `grep POST /var/log/nginx/access.log | grep miniooni | cut -d' ' -f4 | cut -c1-17 | uniq -c`
+
+    Number of requests from miniooni probes in 1-minute buckets:
+
+    `grep POST /var/log/nginx/access.log | grep miniooni | cut -d' ' -f4 | cut -c1-18 | uniq -c`
+
+    Number of requests grouped by hour, cache HIT/MISS/etc, software name and version
+
+    `head -n 100000 /var/log/nginx/access.log | awk '{print $4, $6, $13}' | cut -c1-15,22- | sort | uniq -c | sort -n`
+
+To extract data from the centralized log database
+on [monitoring.ooni.org](#monitoring.ooni.org) 🖥 you can use:
+
+``` sql
+SELECT message FROM logs
+WHERE SYSLOG_IDENTIFIER = 'oohelperd'
+ORDER BY __REALTIME_TIMESTAMP DESC
+LIMIT 10
+```
+
+> **note**
+> The table is indexed by `__REALTIME_TIMESTAMP`. Limiting the range by time can significantly increase query performance.
+
+
+See [Selecting test helper for rotation](#selecting-test-helper-for-rotation) 🐞
+
+## Legacy credentials store
+
+A private repository contains team
+credentials, including username/password tuples, GPG keys and more.
+
+> **warning**
+> The credential file is GPG-encrypted as `credentials.json.gpg`. Do not
+> commit the cleartext `credentials.json` file.
+
+> **note**
+> The credentials are stored in a JSON file to allow a flexible,
+> hierarchical layout. This allows storing metadata like descriptions on
+> account usage, dates of account creation, expiry, and credential
+> rotation time.
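+
+As a sketch of what the underlying decrypt-and-read flow looks like (the
+repo's own `extract` helper, shown further below, is the supported
+interface; the `jq` path here is purely illustrative):
+
+```bash
+# Decrypt the credentials file and pull a single value out of the JSON
+gpg --decrypt credentials.json.gpg | jq -r '.grafana.username'
+```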
+
+The tool checks JSON syntax and sorts keys automatically.
+
+
+#### Listing file contents
+
+    git pull
+    make show
+
+#### Editing contents
+
+    git pull
+    make edit
+    git commit credentials.json.gpg -m ""
+    git push
+
+#### Extracting a credential programmatically
+
+    git pull
+    ./extract 'grafana.username'
+
+> **note**
+> This can be used to automate credential retrieval from other tools, e.g.
+> [Ansible](#ansible) 🔧
+
+#### Updating users allowed to decrypt the credentials file
+
+Edit `makefile` to add or remove recipients (see `--recipient`).
+
+Then run:
+
+    git pull
+    make decrypt encrypt
+    git commit makefile credentials.json.gpg
+    git push
diff --git a/docs/MonitoringAlerts.md b/docs/MonitoringAlerts.md
new file mode 100644
index 00000000..c4fb3b0b
--- /dev/null
+++ b/docs/MonitoringAlerts.md
@@ -0,0 +1,612 @@
+# Monitoring and Alerts
+
+## Application metrics
+ All components of the backend are designed to output application
+ metrics.
+
+ Metrics are prefixed with the name of each application. The metrics are
+ used in [Grafana](#grafana) 🔧 for charts, monitoring and alarming.
+
+ They use the [StatsD](#statsd) 💡 protocol.
+
+ Application metrics data flow:
+
+ ![Diagram](https://kroki.io/blockdiag/svg/eNq9kc1qAyEUhffzFDLZNnGf0EBX7SoEkl0p4arXUaJe8QcKpe9eZ9Imkz5AXHo-OcdzhCN5VhYG9tUxhRqqK6dsICJ7ZolqUKgEfW469hKjsxKKpcDeJTlKjegXWmM7_UcjdlgUFJiro6Z1_8RMQj3emFJiXnM-2GKqWEnynChYLkCeMailIlk9hjL5cOFIcA82_OmnO33l1SJcTKcA-0Qei8GaH5shXn2nGK8JNIQH9zBcTKcA86mW29suDgS60T23d1ndjda4eX1X9O143B_-t9vg309uuu8fUvvJ0Q==)
+
+
+
+ Ellipses represent data; rectangles represent processes. Purple
+ components belong to the backend. Click on the image and then click on
+ each shape to see related documentation.
+
+ [Prometheus](#tool:prometheus) and [Grafana](#grafana) 🔧 provide
+ historical charts for more than 90 days and are useful to investigate
+ long-term trends.
+
+ [Netdata](#netdata) 🔧 provides a web UI with real-time metrics. See
+ the dedicated subchapter for details.
+
+
+### StatsD
+ All backend components send StatsD metrics over UDP using localhost as the destination.
+
+ This guarantees that applications never block on metric generation in
+ case the receiver slows down. The StatsD messages are received by
+ [Netdata](#netdata) 🔧. It automatically tracks any new metric,
+ generates averages and summaries as needed and exposes it to
+ [Prometheus](#prometheus) 🔧 for scraping.
+ In the codebase the statsd library is often used as:
+
+ ```python
+ from .metrics import setup_metrics
+ setup_metrics(name="")
+ metrics.gauge("", )
+ ```
+
+ Because of this, a quick way to identify where metrics are being generated
+ in the backend codebase is to search e.g.:
+
+ *
+ *
+
+ Where possible, timers have the same name as the function being timed e.g.
+
+
+ See [Conventions](#conventions) 💡 for patterns around component naming.
+
+
+#### Metrics list
+ This subsection provides a list of the most important application metrics as they
+ are shown in Grafana. The names are autogenerated by Netdata based on the
+ metric name used in StatsD.
+
+ For example a `@metrics.timer("generate_test_list")` Python decorator is used at:
+ .
+ Such a timer will be processed by Netdata and appear in Grafana as:
+ ```
+ netdata_statsd_timer_ooni_api_generate_test_list_milliseconds_average
+ ```
+
+ The metrics always start with `netdata_statsd` and end with:
+
+ * `_milliseconds_average`
+ * `_events_persec_average`
+ * `_value_average`
+
+ Also see
+
+ TIP: StatsD collectors (like Netdata or others) preprocess datapoints by calculating average/min/max values etc.
+
+ Run this to locate where in the backend codebase application metrics
+ are being generated:
+
+ ```bash
+ find ~ -name '*.py' -exec grep 'metrics\.' -H "{}" \;
+ ```
+
+ Metrics for the [ASN metadata updater](#asn-metadata-updater) ⚙.
+ See the [ASN metadata updater dashboard](#asn-metadata-updater-dashboard) 📊:
+
+```
+netdata_statsd_asnmeta_updater_asnmeta_tmp_len_gauge_value_average
+netdata_statsd_asnmeta_updater_asnmeta_update_progress_gauge_value_average
+netdata_statsd_asnmeta_updater_fetch_data_timer_milliseconds_average
+netdata_statsd_gauge_asnmeta_updater_asnmeta_tmp_len_value_average
+netdata_statsd_gauge_asnmeta_updater_asnmeta_update_progress_value_average
+netdata_statsd_timer_asnmeta_updater_fetch_data_milliseconds_average
+```
+
+
+Metrics for the [CitizenLab test list updater](#citizenlab-test-list-updater) ⚙
+
+```
+netdata_statsd_citizenlab_test_lists_updater_citizenlab_test_list_len_gauge_value_average
+netdata_statsd_citizenlab_test_lists_updater_fetch_citizen_lab_lists_timer_milliseconds_average
+netdata_statsd_citizenlab_test_lists_updater_update_citizenlab_table_timer_milliseconds_average
+netdata_statsd_gauge_citizenlab_test_lists_updater_citizenlab_test_list_len_value_average
+netdata_statsd_gauge_citizenlab_test_lists_updater_rowcount_value_average
+netdata_statsd_timer_citizenlab_test_lists_updater_fetch_citizen_lab_lists_milliseconds_average
+netdata_statsd_timer_citizenlab_test_lists_updater_rebuild_citizenlab_table_from_citizen_lab_lists_milliseconds_average
+netdata_statsd_timer_citizenlab_test_lists_updater_update_citizenlab_table_milliseconds_average
+```
+
+Metrics for the [Database backup tool](#database-backup-tool) ⚙.
+See the [Database backup dashboard](#database-backup-dashboard) 📊 on Grafana:
+
+```
+netdata_statsd_db_backup_run_export_timer_milliseconds_average
+netdata_statsd_db_backup_status_gauge_value_average
+netdata_statsd_db_backup_table_fastpath_backup_time_ms_gauge_value_average
+netdata_statsd_db_backup_table_jsonl_backup_time_ms_gauge_value_average
+netdata_statsd_db_backup_uploaded_bytes_tot_gauge_value_average
+netdata_statsd_db_backup_upload_to_s3_timer_milliseconds_average
+netdata_statsd_gauge_db_backup_status_value_average
+netdata_statsd_gauge_db_backup_table_fastpath_backup_time_ms_value_average
+netdata_statsd_gauge_db_backup_table_jsonl_backup_time_ms_value_average
+netdata_statsd_gauge_db_backup_uploaded_bytes_tot_value_average
+netdata_statsd_timer_db_backup_run_backup_milliseconds_average
+netdata_statsd_timer_db_backup_run_export_milliseconds_average
+netdata_statsd_timer_db_backup_upload_to_s3_milliseconds_average
+netdata_statsd_gauge_db_backup_status_value_average
+netdata_statsd_gauge_db_backup_table_citizenlab_byte_count_value_average
+netdata_statsd_gauge_db_backup_table_fastpath_backup_time_ms_value_average
+netdata_statsd_gauge_db_backup_table_fastpath_byte_count_value_average
+netdata_statsd_gauge_db_backup_table_jsonl_backup_time_ms_value_average
+netdata_statsd_gauge_db_backup_table_jsonl_byte_count_value_average
+netdata_statsd_gauge_db_backup_uploaded_bytes_tot_value_average
+netdata_statsd_timer_db_backup_backup_table_citizenlab_milliseconds_average
+netdata_statsd_timer_db_backup_backup_table_fastpath_milliseconds_average
+netdata_statsd_timer_db_backup_backup_table_jsonl_milliseconds_average
+```
+
+
+Metrics for the [social media blocking event detector](#social-media-blocking-event-detector) ⚙:
+
+```
+netdata_statsd_gauge_detector_blocking_events_tblsize_value_average
+netdata_statsd_gauge_detector_blocking_status_tblsize_value_average
+netdata_statsd_timer_detector_run_detection_milliseconds_average
+```
+
+
+Metrics for the [Fastpath](#fastpath) ⚙. Used in various dashboards,
+primarily the [API and fastpath](#api-and-fastpath) 📊 dashboard.
+
+```
+netdata_statsd_timer_fastpath_db_clickhouse_upsert_summary_milliseconds_average
+netdata_statsd_timer_fastpath_db_fetch_fingerprints_milliseconds_average
+netdata_statsd_timer_fastpath_full_run_milliseconds_average
+netdata_statsd_gauge_fastpath_recent_measurement_count_value_average
+```
+
+
+Metrics for the [Fingerprint updater](#fingerprint-updater) ⚙.
+See the [Fingerprint updater dashboard](#fingerprint-updater-dashboard) 📊 on Grafana.
+
+```
+netdata_statsd_timer_fingerprints_updater_fetch_csv_milliseconds_average
+netdata_statsd_gauge_fingerprints_updater_fingerprints_dns_tmp_len_value_average
+netdata_statsd_gauge_fingerprints_updater_fingerprints_http_tmp_len_value_average
+netdata_statsd_gauge_fingerprints_updater_fingerprints_update_progress_value_average
+```
+
+Metrics from Nginx caching of the aggregation API.
+See [Aggregation cache monitoring](#aggregation-cache-monitoring) 🐍
+
+```
+netdata_statsd_gauge_nginx_aggregation_cache_EXPIRED_value_average
+netdata_statsd_gauge_nginx_aggregation_cache_HIT_value_average
+netdata_statsd_gauge_nginx_aggregation_cache_MISS_value_average
+netdata_statsd_gauge_nginx_aggregation_cache_UPDATING_value_average
+```
+
+Metrics for the [API](#api) ⚙.
+ +``` +netdata_statsd_counter_ooni_api_geoip_asn_differs_events_persec_average +netdata_statsd_counter_ooni_api_geoip_cc_differs_events_persec_average +netdata_statsd_counter_ooni_api_geoip_ipaddr_found_events_persec_average +netdata_statsd_counter_ooni_api_geoip_ipaddr_not_found_events_persec_average +netdata_statsd_counter_ooni_api_gunicorn_request_status_ +netdata_statsd_counter_ooni_api_probe_cc_asn_match_events_persec_average +netdata_statsd_counter_ooni_api_probe_cc_asn_nomatch_events_persec_average +netdata_statsd_counter_ooni_api_probe_legacy_login_successful_events_persec_average +netdata_statsd_counter_ooni_api_probe_login_successful_events_persec_average +netdata_statsd_counter_ooni_api_receive_measurement_count_events_persec_average +netdata_statsd_counter_ooni_api_receive_measurement_discard_asn_ +netdata_statsd_counter_ooni_api_receive_measurement_discard_cc_zz_events_persec_average +netdata_statsd_counter_ooni_api_uploader_msmt_count_events_persec_average +netdata_statsd_counter_ooni_api_uploader_postcan_count_events_persec_average +netdata_statsd_gauge_ooni_api_check_in_test_list_count_value_average +netdata_statsd_gauge_ooni_api_spool_post_count_value_average +netdata_statsd_gauge_ooni_api_test_list_urls_count_value_average +netdata_statsd_timer_ooni_api_apicall___api__v +netdata_statsd_timer_ooni_api_citizenlab_lock_time_milliseconds_average +netdata_statsd_timer_ooni_api_citizenlab_repo_init_milliseconds_average +netdata_statsd_timer_ooni_api_citizenlab_repo_pull_milliseconds_average +netdata_statsd_timer_ooni_api_fetch_citizenlab_data_milliseconds_average +netdata_statsd_timer_ooni_api_fetch_reactive_url_list_milliseconds_average +netdata_statsd_timer_ooni_api_generate_test_list_milliseconds_average +netdata_statsd_timer_ooni_api_get_aggregated_milliseconds_average +netdata_statsd_timer_ooni_api_get_measurement_meta_clickhouse_milliseconds_average +netdata_statsd_timer_ooni_api_get_measurement_meta_milliseconds_average +netdata_statsd_timer_ooni_api_get_raw_measurement_milliseconds_average +netdata_statsd_timer_ooni_api_get_torsf_stats_milliseconds_average +netdata_statsd_timer_ooni_api_gunicorn_request_duration_milliseconds_average +netdata_statsd_timer_ooni_api_open_report_milliseconds_average +netdata_statsd_timer_ooni_api_open_report_milliseconds_averageopen_report +netdata_statsd_timer_ooni_api_receive_measurement_milliseconds_average +netdata_statsd_timer_ooni_api_uploader_fill_jsonl_milliseconds_average +netdata_statsd_timer_ooni_api_uploader_fill_postcan_milliseconds_average +netdata_statsd_timer_ooni_api_uploader_total_run_time_milliseconds_average +netdata_statsd_timer_ooni_api_uploader_update_db_table_milliseconds_average +netdata_statsd_timer_ooni_api_uploader_upload_measurement_milliseconds_average +``` + +Metrics for the [GeoIP downloader](#geoip-downloader) ⚙. + +``` +netdata_statsd_gauge_ooni_download_geoip_geoip_asn_epoch_value_average +netdata_statsd_gauge_ooni_download_geoip_geoip_asn_node_cnt_value_average +netdata_statsd_gauge_ooni_download_geoip_geoip_cc_epoch_value_average +netdata_statsd_gauge_ooni_download_geoip_geoip_cc_node_cnt_value_average +netdata_statsd_timer_ooni_download_geoip_download_geoip_milliseconds_average +``` + +Metrics for the [test helper rotation](#test-helper-rotation) ⚙. 
+
+```
+netdata_statsd_timer_rotation_create_le_do_ssl_cert_milliseconds_average
+netdata_statsd_timer_rotation_deploy_ssl_cert_milliseconds_average
+netdata_statsd_timer_rotation_destroy_drained_droplets_milliseconds_average
+netdata_statsd_timer_rotation_end_to_end_test_milliseconds_average
+netdata_statsd_timer_rotation_run_time_milliseconds_average
+netdata_statsd_timer_rotation_scp_file_milliseconds_average
+netdata_statsd_timer_rotation_setup_nginx_milliseconds_average
+netdata_statsd_timer_rotation_setup_vector_milliseconds_average
+netdata_statsd_timer_rotation_spawn_new_droplet_milliseconds_average
+netdata_statsd_timer_rotation_ssh_reload_nginx_milliseconds_average
+netdata_statsd_timer_rotation_ssh_restart_netdata_milliseconds_average
+netdata_statsd_timer_rotation_ssh_restart_nginx_milliseconds_average
+netdata_statsd_timer_rotation_ssh_restart_vector_milliseconds_average
+netdata_statsd_timer_rotation_ssh_wait_droplet_warmup_milliseconds_average
+netdata_statsd_timer_rotation_update_dns_records_milliseconds_average
+```
+
+
+### Prometheus
+Prometheus is a popular monitoring system and
+runs on [monitoring.ooni.org](#monitoring.ooni.org) 🖥
+
+It is deployed and configured by [Ansible](#ansible) 🔧 using the
+following playbook:
+
+
+Most of the metrics are collected by scraping Prometheus endpoints,
+Netdata, and node exporter. The web UI is accessible at
+
+
+#### Blackbox exporter
+Blackbox exporter is part of the Prometheus project. It's a daemon that performs HTTP
+probing against other hosts without relying on local agents (hence the name Blackbox)
+and feeds the generated datapoints into Prometheus.
+
+See
+
+It is deployed by
+[Ansible](#tool:ansible) on the [monitoring.ooni.org](#monitoring.ooni.org) 🖥
+
+See
+[Updating Blackbox Exporter runbook](#updating-blackbox-exporter-runbook) 📒
+
+
+### Grafana dashboards
+There are a number of dashboards on [Grafana](#grafana) 🔧 at
+
+
+[Grafana](#grafana) 🔧 is deployed on the
+[monitoring.ooni.org](#monitoring.ooni.org) 🖥 host. See
+[Monitoring deployment runbook](#monitoring-deployment-runbook) 📒 for deployment.
+
+The dashboards are used for:
+
+ * Routinely reviewing the general health of the backend infrastructure
+
+ * Predicting long-term scaling requirements, e.g.
+
+   * increasing disk space for the database
+
+   * increasing CPU and memory requirements
+
+ * Investigating alerts and troubleshooting incidents
+
+
+#### Alerting
+Alerts from [Grafana](#tool:grafana) and [Prometheus](#prometheus) 🔧
+are sent to the [#ooni-bots](#topic:oonibots) [Slack](#slack) 🔧
+channel by a bot.
+
+[Slack](#slack) 🔧 can be configured to provide desktop notifications
+from browsers and audible notifications on smartphones.
+
+Alert flow:
+
+![Diagram](https://kroki.io/blockdiag/svg/eNp1jUEKwjAQRfc9xTBd9wSioBtxV3ApIpNmYktjJiQpCuLdTbvQIDirP7zH_8pKN-qBrvCsQLOhyaZL7MkzrCHI5DRrJY9VBW2QG6eepwinTqyELGDN-YzBcxb2gQw5-kOxFnFDoyRFLBVjZmlRioVm86nLEY-WuhG27QGXt6z6YvIef4dmugtyjxwye70BaPFK1w==)
+
+
+
+The diagram does not explicitly include alertmanager. It is part of Prometheus and receives alerts and routes them to Slack.
+
+More detailed diagram:
+
+```mermaid
+flowchart LR
+    P(Prometheus) -- datapoints --> G(Grafana)
+    G --> A(Alertmanager)
+    A --> S(Slack API) --> O(#ooni-bots)
+    P --> A
+    O --> R(Browser / apps)
+    J(Jupyter notebook) --> A
+    classDef box fill:#eeffee,stroke:#777,stroke-width:2px;
+    class P,G,A,S,O,R,J box;
+```
+
+In the diagram Prometheus receives, stores and serves datapoints and has some alert rules to trigger alerts.
+Grafana acts as a UI for Prometheus and also triggers alerts based on alert rules configured in Grafana itself.
+
+Alertmanager is pretty simple: it receives alerts and sends notifications to Slack.
+
+The alert rules are listed at
+The list also shows which alerts are firing at the moment, if any. There
+is also a handful of alerts configured in [Prometheus](#prometheus) 🔧
+using [Ansible](#ansible) 🔧.
+
+The silences list shows if any alert has been temporarily silenced:
+
+
+See [Grafana editing](#grafana-editing) 📒 and
+[Managing Grafana alert rules](#managing-grafana-alert-rules) 📒 for details.
+
+There are also many dashboards and alerts configured in
+[Jupyter Notebook](#jupyter-notebook) 🔧. These are meant for metrics that require more
+complex algorithms, predictions and SQL queries that cannot be
+implemented using [Grafana](#grafana) 🔧 e.g. when using machine learning or Pandas.
+See [Ooniutils microlibrary](#ooniutils-microlibrary) 💡 for details.
+
+On many dashboards you can set the averaging timespan and the target
+hostname using fields on the top left.
+
+Here is an overview of the most useful dashboards:
+
+
+#### API and fastpath
+
+
+This is the most important dashboard showing metrics of the
+[API](#comp:api) and the [Fastpath](#fastpath) ⚙.
+
+
+#### Test-list repository in the API
+
+
+This dashboard shows timings around the git repository checked out by the
+[API](#api) ⚙ that contains the test lists.
+
+
+#### Measurement uploader dashboard
+
+
+This dashboard shows metrics, timing and amounts of data transferred by the
+[Measurement uploader](#measurement-uploader) ⚙
+
+
+#### Fingerprint updater dashboard
+
+
+This dashboard shows metrics and timing from the
+[Fingerprint updater](#fingerprint-updater) ⚙
+
+
+#### ClickHouse dashboard
+
+
+This dashboard shows ClickHouse-specific performance metrics.
+It can be used for optimizations.
+
+For investigating slow queries also see the [ClickHouse queries notebook](#clickhouse-queries-notebook) 📔.
+
+
+#### HaProxy dashboard
+
+
+Basic metrics from [HaProxy](#haproxy) ⚙ load balancers. Used for
+[OONI bridges](#ooni-bridges) ⚙.
+
+
+#### TLS certificate dashboard
+
+
+Certificate expiration times. There are alerts configured in
+[Grafana](#grafana) 🔧 to alert on expiring certificates.
+
+
+#### Test helpers dashboard
+
+
+Status, uptime and load metrics from the
+[Test helpers](#test-helpers) ⚙.
+
+
+#### Database backup dashboard
+
+
+Metrics, timing and data transferred by the
+[Database backup tool](#database-backup-tool) ⚙
+
+By looking at the last 24 hours of runs you should be able to see the backup
+being run.
+
+The "Status" chart shows the running status.
+"Uploaded bytes in total" and "Backup time" should be self-explanatory.
+
+TIP: If the backup time or size grows too much it could be worth alerting and considering implementing incremental backups.
+
+
+#### Event detector dashboard
+
+
+Basic metrics from the
+[social media blocking event detector](#social-media-blocking-event-detector) ⚙
+
+
+#### GeoIP MMDB database dashboard
+
+
+Age and size of the GeoIP MMDB database. Also, a chart showing
+discrepancies between the lookup performed by the probes VS the one in
+the API, used to gauge the benefits of using a centralized solution.
+
+Also see [Geolocation script](#geolocation-script) 🐍
+
+See [GeoIP downloader](#geoip-downloader) ⚙
+
+
+#### Host clock offset dashboard
+
+
+Measures NTP clock sync and alarms on big offsets.
+
+
+#### Netdata-specific dashboard
+
+
+Shows all the metrics captured by [Netdata](#netdata) 🔧 - useful for
+in-depth performance investigation.
+
+
+#### ASN metadata updater dashboard
+
+
+Progress, runtime and table size of the [ASN metadata updater](#asn-metadata-updater) ⚙
+
+See [Metrics list](#metrics-list) 💡
+
+
+### Netdata
+Netdata is a monitoring agent that runs
+locally on the backend servers. It exports host and
+[Application metrics](#topic:appmetrics) to [Prometheus](#prometheus) 🔧.
+
+It also provides a web UI that can be accessed on port 19999. It can be
+useful during development, performance optimization and debugging as it
+provides metrics with higher time granularity (1 second) and almost no
+delay.
+
+Netdata is not exposed on the Internet for security reasons and can be
+accessed only when needed by setting up port forwarding using SSH. For
+example:
+
+```bash
+ssh ams-pg-test.ooni.org -L 19998:127.0.0.1:19999
+```
+
+Netdata can also be run on a development desktop and be accessed locally
+in order to explore application metrics without having to deploy
+[Prometheus](#tool:prometheus) and [Grafana](#grafana) 🔧.
+
+See the [Netdata-specific dashboard](#netdata-specific-dashboard) 📊 for an example of native
+Netdata metrics.
+
+
+## Log management
+All components of the backend are designed to output logs to Systemd's
+journald. They usually log using the component name as Systemd unit
+name.
+
+Sometimes you might have to use `--identifier ` instead for
+scripts that are not run as Systemd units.
+
+Journald automatically indexes logs by time, unit name and other items.
+This makes it possible to quickly filter logs during troubleshooting, for example:
+
+```bash
+sudo journalctl -u ooni-api --since '10 m ago'
+```
+
+Or follow live logs using e.g.:
+
+```bash
+sudo journalctl -u nginx -f
+```
+
+Sometimes it is useful to show milliseconds in the timestamps:
+
+```bash
+sudo journalctl -f -u ooni-api -o short-precise
+```
+
+The logger used in Python components also sets additional fields,
+notably CODE_FUNC and CODE_LINE.
+
+Available fields can be listed using:
+
+```bash
+sudo journalctl -f -u ooni-api -N | sort
+```
+
+It is possible to filter by those fields. This comes in very handy for
+debugging, e.g.:
+
+```bash
+sudo journalctl -f -u ooni-api CODE_FUNC=open_report
+```
+
+Every host running backend services also sends host logs to
+monitoring.ooni.org using [Vector](#vector) 🔧.
+
+![Diagram](https://kroki.io/blockdiag/svg/eNrFks9qwzAMxu95CpNel_gYWOlgDEqfYJdRiv_IiYltBccphdF3n5yyNelltx11_qTv5yR9hapXVrTss2AajJhcOo2dGIDtWMQpaNASL9uCvQ6Ds0oki4FVL-wdVMJYjUCKuEhEUGDPt9QbNfQHnEZ46P9Q6DCSQ7kxBijKIynWTy40WWFM-cS6CGaXU11Kw_jMeWtTN8laoeeIwXIpVE_tlUY1eQhptuPSoeRe2PBdP63qtdeb8-y9xPgZ5N9A7t_3xwwqG3fZOHMUrKVDGPKBUCzWuF1vjIivD-LfboLCCQkuT-EJmcQ2tHWmrzG25U1yn71p9vumKWen6xdypu8x)
+
+
+There is a dedicated ClickHouse instance on monitoring.ooni.org used to
+collect logs. See the [ClickHouse instance for logs](#clickhouse-instance-for-logs) ⚙.
+This is done to avoid adding unnecessary load to the production database
+on FSN that contains measurements and also keep a copy of FSN's logs on
+a different host.
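+
+Since the `logs` table is indexed by `__REALTIME_TIMESTAMP`, time-bounded
+queries are much cheaper. As a sketch, assuming the column holds
+journald's microseconds since epoch as forwarded by Vector:
+
+```bash
+# Fetch the last hour of oohelperd logs from the centralized log database
+clickhouse-client --query "
+  SELECT message FROM logs
+  WHERE SYSLOG_IDENTIFIER = 'oohelperd'
+  AND __REALTIME_TIMESTAMP > (toUnixTimestamp(now()) - 3600) * 1000000
+  ORDER BY __REALTIME_TIMESTAMP DESC
+  LIMIT 10"
+```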
+
+The receiving [Vector](#vector) 🔧 instance and ClickHouse are
+deployed and configured by [Ansible](#ansible) 🔧 using the following
+playbook:
+
+
+See [Logs from FSN notebook](#logs-from-fsn-notebook) 📔 and
+[Logs investigation notebook](#logs-investigation-notebook) 📔
+
+
+### Slack
+[Slack](https://slack.com/) is used for team messaging and automated
+alerts at the following instance:
+
+
+#### #ooni-bots
+`#ooni-bots` is a [Slack](#slack) 🔧 channel used for automated
+alerts:
diff --git a/docs/Runbooks.md b/docs/Runbooks.md
new file mode 100644
index 00000000..550d6978
--- /dev/null
+++ b/docs/Runbooks.md
@@ -0,0 +1,1155 @@
+# Runbooks
+
+Below you will find runbooks for common tasks and operations to manage our infra.
+
+## Monitoring deployment runbook
+
+The monitoring stack is deployed and configured by
+[Ansible](#tool:ansible) on the [monitoring.ooni.org](#monitoring.ooni.org) 🖥
+host using the following playbook:
+
+
+It includes:
+
+- [Grafana](#grafana) 🔧 at
+
+- [Jupyter Notebook](#jupyter-notebook) 🔧 at
+
+- [Vector](#tool:vector) (see [Log management](#log-management) 💡)
+
+- local [Netdata](#tool:netdata), [Blackbox exporter](#blackbox-exporter) 🔧, etc
+
+- [Prometheus](#prometheus) 🔧 at
+
+It also configures the FQDNs:
+
+- loghost.ooni.org
+
+- monitoring.ooni.org
+
+- netdata.ooni.org
+
+This also includes the credentials to access the Web UIs. They are
+deployed as `/etc/nginx/monitoring.htpasswd` from
+`ansible/roles/monitoring/files/htpasswd`
+
+**Warning** the following steps are dangerously broken. Applying the changes
+will either not work or, worse, break production.
+
+If you must do something of this sort, you will unfortunately have to resort to
+specifying the particular substeps you want to run using the `-t` tag filter
+(e.g. `-t prometheus-conf` to update the Prometheus configuration).
+
+Steps:
+
+1. Review [Ansible playbooks summary](#ansible-playbooks-summary) 📒,
+   [Deploying a new host](#run:newhost) and [Grafana dashboards](#grafana-dashboards) 💡.
+
+2. Run `./play deploy-monitoring.yml -l monitoring.ooni.org --diff -C`
+   and review the output
+
+3. Run `./play deploy-monitoring.yml -l monitoring.ooni.org --diff` and
+   review the output
+
+## Updating Blackbox Exporter runbook
+
+This runbook describes updating [Blackbox exporter](#blackbox-exporter) 🔧.
+
+The `blackbox_exporter` role in ansible is pulled in by the `deploy-monitoring.yml`
+runbook.
+
+The configuration file is at `roles/blackbox_exporter/templates/blackbox.yml.j2`
+together with `host_vars/monitoring.ooni.org/vars.yml`.
+
+To add a simple HTTP[S] check, for example, you can copy the "ooni website" block.
+
+Edit it and run the deployment of the monitoring stack as described in the previous subchapter.
+
+## Deploying a new host
+
+To deploy a new host:
+
+1. Choose a FQDN like $name.ooni.org based on the
+   [DNS naming policy](#dns-naming-policy) 💡
+
+2. Deploy the physical host or VM using Debian Stable
+
+3. Create `A` and `AAAA` records for the FQDN in the Namecheap web UI
+
+4. Follow [Updating DNS diagrams](#updating-dns-diagrams) 📒
+
+5. Review the `inventory` file and git-commit it
+
+6. Deploy the required stack. Run Ansible in test mode first. For
+   example this would deploy a backend host:
+
+       ./play deploy-backend.yml --diff -l .ooni.org -C
+       ./play deploy-backend.yml --diff -l .ooni.org
+
+7. Update [Prometheus](#prometheus) 🔧 by following
+   [Monitoring deployment runbook](#monitoring-deployment-runbook) 📒
+
+8. git-push the commits
+
+Also see [Monitoring deployment runbook](#monitoring-deployment-runbook) 📒 for an
+example of deployment.
+
+## Deleting a host
+
+1. Remove it from `inventory`
+
+2. Update the monitoring deployment using:
+
+```
+./play deploy-monitoring.yml -t prometheus-conf -l monitoring.ooni.org --diff
+```
+
+## Weekly measurements review runbook
+
+On a daily or weekly basis the following dashboards and Jupyter notebooks can be reviewed to detect unexpected patterns in measurements, focusing on measurement drops, slowdowns or any potential issue affecting the backend infrastructure.
+
+When browsing the dashboards expand the time range to one year in order to spot long-term trends.
+Also zoom in to the last month to spot small glitches that could otherwise go unnoticed.
+
+Review the [API and fastpath](#api-and-fastpath) 📊 dashboard for the production backend host[s] for measurement flow, CPU and memory load,
+timings of various API calls, disk usage.
+
+Review the [Incoming measurements notebook](#incoming-measurements-notebook) 📔 for unexpected trends.
+
+Quickly review the following dashboards for unexpected changes:
+
+ * [Long term measurements prediction notebook](#long-term-measurements-prediction-notebook) 📔
+ * [Test helpers dashboard](#test-helpers-dashboard) 📊
+ * [Test helper failure rate notebook](#test-helper-failure-rate-notebook) 📔
+ * [Database backup dashboard](#database-backup-dashboard) 📊
+ * [GeoIP MMDB database dashboard](#geoip-mmdb-database-dashboard) 📊
+ * [GeoIP dashboard](#geoip-mmdb-database-dashboard) 📊
+ * [Fingerprint updater dashboard](#fingerprint-updater-dashboard) 📊
+ * [ASN metadata updater dashboard](#asn-metadata-updater-dashboard) 📊
+
+Also check for glitches like notebooks not being run etc.
+
+
+## Grafana backup runbook
+This runbook describes how to back up dashboards and alarms in Grafana.
+It does not include backing up datapoints stored in
+[Prometheus](#prometheus) 🔧.
+
+The Grafana SQLite database can be dumped by running:
+
+```bash
+sqlite3 -line /var/lib/grafana/grafana.db '.dump' > grafana_dump.sql
+```
+
+Future implementation is tracked in:
+[Implement Grafana dashboard and alarms backup](#implement-grafana-dashboard-and-alarms-backup) 🐞
+
+
+## Grafana editing
+This runbook describes adding new dashboards, panels and alerts in
+[Grafana](#grafana) 🔧
+
+To add a new dashboard use this
+
+
+To add a new panel to an existing dashboard load the dashboard and then
+click the "Add" button on the top.
+
+Many dashboards use variables. For example, on
+
+the variables `$host` and `$avgspan` are set on the top left and used in
+metrics like:
+
+    avg_over_time(netdata_disk_backlog_milliseconds_average{instance="$host:19999"}[$avgspan])
+
+
+### Managing Grafana alert rules
+Alert rules can be listed at
+
+> **note**
+> The list also shows which alerts are currently alarming, if any.
+
+Click the arrow on the left to expand each alerting rule.
+
+The list shows:
+
+![editing_alerts](../../../assets/images-backend/grafana_alerts_editing.png)
+
+> **note**
+> When creating alerts it can be useful to add full URLs linking to
+> dashboards, runbooks etc.
+
+To stop notifications create a "silence" either:
+
+1. by further expanding an alert rule (see below) and clicking the
+   "Silence" button
+
+2. by inputting it in
+
+Screenshot:
+
+![adding_silence](../../../assets/images-backend/grafana_alerts_silence.png)
+
+Additionally, the "Show state history" button is useful especially
+with flapping alerts.
+
+
+### Adding new fingerprints
+This is performed on
+
+Updates are fetched automatically by the
+[Fingerprint updater](#fingerprint-updater) ⚙
+
+Also see the [Fingerprint updater dashboard](#fingerprint-updater-dashboard) 📊.
+
+
+### Backend code changes
+This runbook describes making changes to backend components and
+deploying them.
+
+Summary of the steps:
+
+1. Check out the backend repository.
+
+2. Create a dedicated branch.
+
+3. Update `debian/changelog` in the component you want to modify. See
+   [Package versioning](#package-versioning) 💡 for details.
+
+4. Run unit/functional/integration tests as needed.
+
+5. Create a pull request.
+
+6. Ensure the CI workflows are successful.
+
+7. Deploy the package on the testbed [ams-pg-test.ooni.org](#ams-pg-test.ooni.org) 🖥
+   and verify the change works as intended.
+
+8. Add a comment to the PR with the deployed version and stage.
+
+9. Wait for the PR to be approved.
+
+10. Deploy the package to production on
+    [backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥. Ensure it is the same version
+    that has been used on the testbed. See [API runbook](#api-runbook) 📒 for
+    deployment steps.
+
+11. Add a comment to the PR with the deployed version and stage, then merge
+    the PR.
+
+When introducing new metrics:
+
+1. Create [Grafana](#grafana) 🔧 dashboards, alerts and
+   [Jupyter Notebook](#jupyter-notebook) 🔧 and link them in the PR.
+
+2. Collect and analyze metrics and logs from the testbed stages before
+   deploying to production.
+
+3. Test alarming by simulating incidents.
+
+### Backend component deployment
+This runbook provides general steps to deploy backend components on
+production hosts.
+
+Review the package changelog and the related pull request.
+
+The amount of testing and monitoring required depends on:
+
+1. the impact of possible bugs in terms of number of users affected and
+   consequences
+
+2. the level of risk involved in rolling back the change, if needed
+
+3. the complexity of the change and the risk of unforeseen impact
+
+Monitor the [API and fastpath](#api-and-fastpath) 📊 and dedicated . Review past
+weeks for any anomaly before starting a deployment.
+
+Ensure the database schema is consistent with the new deployment, either
+by creating tables and columns manually or by having the new codebase
+update the database automatically.
+
+Quickly check past logs.
+
+Follow logs with:
+
+``` bash
+sudo journalctl -f --no-hostname
+```
+
+While monitoring the logs, deploy the package using
+[The deployer tool](#the-deployer-tool) 🔧. (Details in the tool's subchapter.)
+
+
+### API runbook
+This runbook describes making changes to the [API](#api) ⚙ and
+deploying it.
+
+Follow [Backend code changes](#backend-code-changes) 📒 and
+[Backend component deployment](#backend-component-deployment) 📒.
+
+In addition, monitor logs from Nginx and the API focusing on HTTP errors and
+failing SQL queries.
+
+Manually check [Explorer](#explorer) 🖱 and other
+[Public and private web UIs](#public-and-private-web-uis) 💡 as needed.
+
+
+#### Managing feature flags
+To change feature flags in the API a simple pull request like
+ is enough.
+
+Follow [Backend code changes](#backend-code-changes) 📒 and deploy it after
+basic testing on [ams-pg-test.ooni.org](#ams-pg-test.ooni.org) 🖥.
+
+
+### Running database queries
+This subsection describes how to run queries against
+[ClickHouse](#clickhouse) ⚙.
You can run queries from
+[Jupyter Notebook](#jupyter-notebook) 🔧 or from the CLI:
+
+```bash
+ ssh 
+ $ clickhouse-client
+```
+
+Prefer using the default user when possible. To log in as admin:
+
+```bash
+ $ clickhouse-client -u admin --password 
+```
+
+> **note**
+> Heavy queries can impact the production database. When in doubt run them
+> on the CLI interface in order to terminate them using CTRL-C if needed.
+
+> **warning**
+> ClickHouse is not transactional! Always test queries that mutate schemas
+> or data on testbeds like [ams-pg-test.ooni.org](#ams-pg-test.ooni.org) 🖥
+
+For long running queries see the use of timeouts in
+[Fastpath deduplication](#fastpath-deduplication) 📒
+
+Also see [Dropping tables](#dropping-tables) 📒,
+[Investigating table sizes](#investigating-table-sizes) 📒
+
+
+#### Modifying the fastpath table
+This runbook shows an example of changing the contents of the
+[fastpath table](#fastpath-table) ⛁ by running a "mutation" query.
+
+> **warning**
+> This method creates changes that cannot be reproduced by external
+> researchers by [Reprocessing measurements](#reprocessing-measurements) 📒. See
+> [Reproducibility](#reproducibility) 💡
+
+In this example [Signal test](#signal-test) Ⓣ measurements are being
+flagged as failed due to
+
+Summarize affected measurements with:
+
+``` sql
+SELECT test_version, msm_failure, count()
+FROM fastpath
+WHERE test_name = 'signal' AND measurement_start_time > '2023-11-06T16:00:00'
+GROUP BY msm_failure, test_version
+ORDER BY test_version ASC
+```
+
+> **important**
+> `ALTER TABLE … UPDATE` starts a
+> [mutation](https://clickhouse.com/docs/en/sql-reference/statements/alter#mutations)
+> that runs in the background.
+
+Check for any running or stuck mutation:
+
+``` sql
+SELECT * FROM system.mutations WHERE is_done != 1
+```
+
+Start the mutation:
+
+``` sql
+ALTER TABLE fastpath
+UPDATE
+    msm_failure = 't',
+    anomaly = 'f',
+    scores = '{"blocking_general":0.0,"blocking_global":0.0,"blocking_country":0.0,"blocking_isp":0.0,"blocking_local":0.0,"accuracy":0.0,"msg":"bad test_version"}'
+WHERE test_name = 'signal'
+AND measurement_start_time > '2023-11-06T16:00:00'
+AND msm_failure = 'f'
+```
+
+Run the previous `SELECT` queries to monitor the mutation and its
+outcome.
+
+
+### Updating tor targets
+See [Tor targets](#tor-targets) 🐝 for a general description.
+
+Review the [Ansible](#ansible) 🔧 chapter. Check out the repository and
+update the file `ansible/roles/ooni-backend/templates/tor_targets.json`
+
+Commit the changes and deploy as usual:
+
+    ./play deploy-backend.yml --diff -l ams-pg-test.ooni.org -t api -C
+    ./play deploy-backend.yml --diff -l ams-pg-test.ooni.org -t api
+
+Test the updated configuration, then:
+
+    ./play deploy-backend.yml --diff -l backend-fsn.ooni.org -t api -C
+    ./play deploy-backend.yml --diff -l backend-fsn.ooni.org -t api
+
+git-push the changes.
+
+Implements [Document Tor targets](#document-tor-targets) 🐞
+
+
+### Creating admin API accounts
+See [Auth](#auth) 🐝 for a description of the API entry points related
+to account management.
+
+The API provides entry points to:
+
+ * [get role](https://api.ooni.io/apidocs/#/default/get_api_v1_get_account_role__email_address_)
+
+ * [set role](https://api.ooni.io/apidocs/#/default/post_api_v1_set_account_role).
+
+The latter is implemented
+[here](https://github.com/ooni/backend/blob/0ec9fba0eb9c4c440dcb7456f2aab529561104ae/api/ooniapi/auth.py#L437).
+
+> **important**
+> The default value for API accounts is `user`. For such accounts there is
+> no need for a record in the `accounts` table.
+
+To change roles it is required to be authenticated and have the `admin`
+role.
+
+It is also possible to create or update roles by running SQL queries
+directly on [ClickHouse](#clickhouse) ⚙. This can be necessary to
+create the initial `admin` account on a new deployment stage.
+
+A quick way to identify the account ID of a user is to extract logs from
+the [API](#api) ⚙ either from the backend host or using the
+[Logs from FSN notebook](#logs-from-fsn-notebook) 📔
+
+```bash
+sudo journalctl --since '5 min ago' -u ooni-api | grep 'SELECT role FROM accounts WHERE account_id' -C5
+```
+
+Example output:
+
+    Nov 09 16:03:00 backend-fsn ooni-api[1763457]: DEBUG Query: SELECT role FROM accounts WHERE account_id = ''
+
+Then on the database test host:
+
+```bash
+clickhouse-client
+```
+
+Then in the ClickHouse shell insert a record to give the `admin` role to
+the user. See [Running database queries](#running-database-queries) 📒:
+
+```sql
+INSERT INTO accounts (account_id, role) VALUES ('', 'admin')
+```
+
+`accounts` is an EmbeddedRocksDB table with `account_id` as primary key.
+No record deduplication is necessary.
+
+To access the new role the user has to log out from web UIs and log in
+again.
+
+> **important**
+> Account IDs are not the same across test and production instances.
+
+This is due to the use of a configuration variable
+`ACCOUNT_ID_HASHING_KEY` in the hashing of the email address. The
+parameter is read from the API configuration file. The values are
+different across deployment stages as a security feature.
+
+
+### Fastpath runbook
+
+#### Fastpath code changes and deployment
+Review [Backend code changes](#backend-code-changes) 📒 and
+[Backend component deployment](#backend-component-deployment) 📒 for changes and deployment of the
+backend stack in general.
+
+Also see [Modifying the fastpath table](#modifying-the-fastpath-table) 📒
+
+In addition, monitor logs and [Grafana dashboards](#grafana-dashboards) 💡
+focusing on changes in incoming measurements.
+
+You can use [The deployer tool](#the-deployer-tool) 🔧 to perform
+deployment and rollbacks of the [Fastpath](#fastpath) ⚙.
+
+> **important**
+> The fastpath is configured **not** to restart automatically during
+> deployment.
+
+Always monitor logs and restart it as needed:
+
+```bash
+sudo systemctl restart fastpath
+```
+
+
+#### Fastpath manual deployment
+Sometimes it can be useful to run APT directly:
+
+```bash
+ssh 
+sudo apt-get update
+apt-cache show fastpath | grep Ver | head -n5
+sudo apt-get install fastpath=
+```
+
+
+#### Reprocessing measurements
+Reprocess old measurements by running the fastpath manually. This can be
+done without shutting down the fastpath instance running on live
+measurements.
+
+You can run the fastpath as root or using the fastpath user. Both users
+are able to read the configuration file under `/etc/ooni`. The fastpath
+will download [Postcans](#postcans) 💡 in the local directory.
+
+`fastpath -h` generates:
+
+    usage:
+    OONI Fastpath
+
+    See README.adoc
+
+    [-h] [--start-day START_DAY] [--end-day END_DAY]
+    [--devel] [--noapi] [--stdout] [--debug]
+    [--db-uri DB_URI]
+    [--clickhouse-url CLICKHOUSE_URL] [--update]
+    [--stop-after STOP_AFTER] [--no-write-to-db]
+    [--keep-s3-cache] [--ccs CCS]
+    [--testnames TESTNAMES]
+
+    options:
+    -h, --help show this help message and exit
+    --start-day START_DAY
+    --end-day END_DAY
+    --devel Devel mode
+    --noapi Process measurements from S3 and do not start API feeder
+    --stdout Log to stdout
+    --debug Log at debug level
+    --clickhouse-url CLICKHOUSE_URL
+    ClickHouse url
+    --stop-after STOP_AFTER
+    Stop after feeding N measurements from S3
+    --no-write-to-db Do not insert measurement in database
+    --ccs CCS Filter comma-separated CCs when feeding from S3
+    --testnames TESTNAMES
+    Filter comma-separated test names when feeding from S3 (without
+    underscores)
+
+To run the fastpath manually use:
+
+    ssh 
+    sudo sudo -u fastpath /bin/bash
+
+    fastpath --help
+    fastpath --start-day 2023-08-14 --end-day 2023-08-19 --noapi --stdout
+
+The `--no-write-to-db` option can be useful for testing.
+
+The `--ccs` and `--testnames` flags are useful to selectively reprocess
+measurements.
+
+After reprocessing measurements it's recommended to manually deduplicate
+the contents of the `fastpath` table. See
+[Fastpath deduplication](#fastpath-deduplication) 📒
+
+> **note**
+> It is possible to run multiple `fastpath` processes using
+> with different time ranges.
+> Running the reprocessing under `byobu` is recommended.
+
+The fastpath will pull [Postcans](#postcans) 💡 from S3. See
+[Feed fastpath from JSONL](#feed-fastpath-from-jsonl) 🐞 for possible speedup.
+
+
+#### Fastpath monitoring
+The fastpath pipeline can be monitored using the
+[Fastpath dashboard](#dash:api_fp) and [API and fastpath](#api-and-fastpath) 📊.
+
+Also follow the real-time process using:
+
+    sudo journalctl -f -u fastpath
+
+
+### Android probe release runbook
+This runbook is meant to help coordinate Android probe releases between
+the probe and backend developers and public announcements. It does not
+contain detailed instructions for individual components.
+
+Also see the [Measurement drop runbook](#measurement-drop-tutorial) 📒.
+
+
+Roles: \@probe, \@backend, \@media
+
+
+#### Android pre-release
+\@probe: drive the process involving the other teams as needed. Create
+calendar events to track the next steps. Run the probe checklist
+
+
+\@backend: review
+
+and
+
+for long-term trends
+
+
+#### Android release
+\@probe: release the probe for early adopters
+
+\@backend: monitor
+
+frequently during the first 24h and report any drop on
+[Slack](#slack) 🔧
+
+\@probe: wait at least 24h then release the probe for all users
+
+\@backend: monitor
+
+daily for 14 days and report any drop on [Slack](#slack) 🔧
+
+\@probe: wait at least 24h then poke \@media to announce the release
+
+(
+
+
+### CLI probe release runbook
+This runbook is meant to help coordinate CLI probe releases between the
+probe and backend developers and public announcements. It does not
+contain detailed instructions for individual components.
+
+Roles: \@probe, \@backend, \@media
+
+
+#### CLI pre-release
+\@probe: drive the process involving the other teams as needed. Create
+calendar events to track the next steps. Run the probe checklist and
+review the CI.
+
+\@backend: review
+\[jupyter\]()
+and
+\[grafana\]()
+for long-term trends
+
+
+#### CLI release
+\@probe: release the probe for early adopters
+
+\@backend: monitor
+\[jupyter\]()
+frequently during the first 24h and report any drop on
+[Slack](#slack) 🔧
+
+\@probe: wait at least 24h then release the probe for all users
+
+\@backend: monitor
+\[jupyter\]()
+daily for 14 days and report any drop on [Slack](#slack) 🔧
+
+\@probe: wait at least 24h then poke \@media to announce the release
+
+
+### Investigating heavy aggregation queries runbook
+In the following scenario the [Aggregation and MAT](#aggregation-and-mat) 🐝 API is
+experiencing query timeouts impacting users.
+
+Reproduce the issue by setting a large enough time span on the MAT,
+e.g.:
+
+
+Click on the link to JSON, e.g.
+
+
+Review the [backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥 metrics on
+
+(see [Netdata-specific dashboard](#netdata-specific-dashboard) 📊 for details)
+
+Also review the [API and fastpath](#api-and-fastpath) 📊 dashboard, looking at
+CPU load, disk I/O, query time, measurement flow.
+
+Also see [Aggregation cache monitoring](#aggregation-cache-monitoring) 🐍
+
+Refresh and review the charts on the [ClickHouse queries notebook](#clickhouse-queries-notebook) 📔.
+
+In this instance frequent calls to the aggregation API are found.
+
+Review the summary of the API quotas. See
+[Calling the API manually](#calling-the-api-manually) 📒 for details:
+
+    $ http https://api.ooni.io/api/_/quotas_summary Authorization:'Bearer '
+
+Log on [backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥 and review the logs:
+
+    backend-fsn:~$ sudo journalctl --since '5 min ago'
+
+Summarize the subnets calling the API:
+
+    backend-fsn:~$ sudo journalctl --since '5 hour ago' -u ooni-api -u nginx | grep aggreg | cut -d' ' -f 8 | sort | uniq -c | sort -nr | head
+
+      807
+      112
+       92
+       38
+       16
+       15
+       11
+       11
+       10
+
+To block IP addresses or subnets see [Nginx](#nginx) ⚙ or
+[HaProxy](#haproxy) ⚙, then configure the required file in
+[Ansible](#ansible) 🔧 and deploy.
+
+Also see [Limiting scraping](#limiting-scraping) 📒.
+
+
+### Aggregation cache monitoring
+To monitor cache hit/miss ratio using StatsD metrics the following
+script can be run as needed.
+
+See [Metrics list](#metrics-list) 💡.
+
+``` python
+import subprocess
+
+import statsd
+metrics = statsd.StatsClient('localhost', 8125)
+
+def main():
+    cmd = "sudo journalctl --since '5 min ago' -u nginx | grep 'GET /api/v1/aggregation' | cut -d ' ' -f 10 | sort | uniq -c"
+    out = subprocess.check_output(cmd, shell=True)
+    for line in out.splitlines():
+        cnt, name = line.strip().split()
+        name = name.decode()
+        metrics.gauge(f"nginx_aggregation_cache_{name}", int(cnt))
+
+if __name__ == '__main__':
+    main()
+```
+
+
+### Limiting scraping
+Aggressive bots and scrapers can be limited using a combination of
+methods, listed below ordered from the most user-friendly:
+
+1. Reduce the impact on the API (CPU, disk I/O, memory usage) by
+   caching the results.
+
+2. [Rate limiting and quotas](#rate-limiting-and-quotas) 🐝 already built into the API. It
+   might need lowering of the quotas.
+
+3. Adding API entry points to [Robots.txt](#robots.txt) 🐝
+
+4. Adding specific `User-Agent` entries to [Robots.txt](#robots.txt) 🐝
+
+5. Blocking IP addresses or subnets in the [Nginx](#nginx) ⚙ or
+   [HaProxy](#haproxy) ⚙ configuration files
+
+To add caching to the API or increase the expiration times:
+
+1. Identify API calls that cause significant load. [Nginx](#nginx) ⚙
+   is configured to log timing information for each HTTP request. See
+   [Logs investigation notebook](#logs-investigation-notebook) 📔 for examples. Also see
+   [Logs from FSN notebook](#logs-from-fsn-notebook) 📔 and
+   [ClickHouse instance for logs](#clickhouse-instance-for-logs) ⚙. Additionally,
+   [Aggregation cache monitoring](#aggregation-cache-monitoring) 🐍 can be tweaked for the present use-case.
+
+2. Implement caching or increase expiration times across the API
+   codebase. See [API cache](#api-cache) 💡 and
+   [Purging Nginx cache](#purging-nginx-cache) 📒.
+
+3. Monitor the improvement in terms of cache hit VS cache miss ratio.
+
+> **important**
+> Caching can be applied selectively for API requests that return rapidly
+> changing data VS old, stable data. See [Aggregation and MAT](#aggregation-and-mat) 🐝
+> for an example.
+
+To update the quotas edit the API here
+
+and deploy as usual.
+
+To update the `robots.txt` entry point see [Robots.txt](#robots.txt) 🐝 and
+edit the API here
+`__init__.py#L124`
+and deploy as usual.
+
+To block IP addresses or subnets see [Nginx](#nginx) ⚙ or
+[HaProxy](#haproxy) ⚙, then configure the required file in
+[Ansible](#ansible) 🔧 and deploy.
+
+
+### Calling the API manually
+To make HTTP calls to the API manually you'll need to extract a JWT from
+the browser, sometimes with admin rights.
+
+In Firefox, authenticate against , then
+open Inspect >> Storage >> Local Storage >> Find
+`{"token": ""}`
+
+Extract the ASCII-encoded token string without braces or quotes.
+
+Call the API using [httpie](https://httpie.io/) with:
+
+    $ http https://api.ooni.io/ Authorization:'Bearer '
+
+E.g.:
+
+    $ http https://api.ooni.io/api/_/quotas_summary Authorization:'Bearer '
+
+> **note**
+> Do not leave whitespace after "Authorization:"
+
+
+### Build, deploy, rollback
+
+Host deployments are done with the
+[sysadmin repo](https://github.com/ooni/sysadmin)
+
+For component updates a deployment pipeline is used:
+
+Look at the \[Status
+dashboard\]() - be aware
+of badge image caching
+
+
+### The deployer tool
+Deployments can be performed with a tool that acts as a frontend for
+APT. It implements a simple Continuous Delivery workflow from CLI. It
+does not require running a centralized CD pipeline server (e.g. like
+)
+
+The tool is hosted on the backend repository together with its
+configuration file for simplicity:
+
+
+At start time it traverses the path from the current working directory
+back to root until it finds a configuration file named `deployer.ini`. This
+allows using different deployment pipelines stored in configuration
+files across different repositories and subdirectories.
+
+The tool connects to the hosts to perform deployments and requires sudo
+rights. It installs Debian packages from repositories already configured
+on the hosts.
+
+It runs `apt-get update` and then `apt-get install …` to update or
+rollback packages. By design, it does not interfere with manual
+execution of apt-get or through tools like [Ansible](#ansible) 🔧.
+This means operators can log on a host to do manual upgrade or rollback
+of packages without breaking the deployer tool.
+
+The tool depends only on the `python3-apt` package.
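+
+For instance, on a Debian-based machine the dependency can be installed
+with (standard Debian packaging; nothing OONI-specific is assumed here):
+
+```bash
+sudo apt-get install python3-apt
+```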
+
+Here is a configuration file example, with comments:
+
+``` ini
+[environment]
+## Location on the path where SVG badges are stored
+badges_path = /var/www/package_badges
+
+
+## List of packages that are handled by the deployer, space separated
+deb_packages = ooni-api fastpath analysis detector
+
+
+## List of deployment stage names, space separated, from the least to the most critical
+stages = test hel prod
+
+
+## For each stage a block named stage: is required.
+## The block lists the stage hosts.
+
+
+## Example of an unused stage (not listed under stages)
+[stage:alpha]
+hosts = localhost
+
+[stage:test]
+hosts = ams-pg-test.ooni.org
+
+[stage:hel]
+hosts = backend-hel.ooni.org
+
+[stage:prod]
+hosts = backend-fsn.ooni.org
+```
+
+By running the tool without any argument it will connect to the hosts
+from the configuration file and print a summary of the installed
+packages, for example:
+
+``` bash
+$ deployer
+
+ Package               test                 prod
+ooni-api      1.0.79~pr751-194     1.0.79~pr751-194
+fastpath        0.81~pr748-191 ►►     0.77~pr705-119
+analysis          1.9~pr659-61 ⚠     1.10~pr692-102
+detector          0.3~pr651-98         0.3~pr651-98
+```
+
+The green arrows between two package versions indicate that the version
+on the left side is higher than the one on the right side. This means
+that a rollout is pending. In the example the fastpath package on the
+"prod" stage can be updated.
+
+A red warning sign indicates that the version on the right side is
+higher than the one on the left side. During a typical continuous
+deployment workflow version numbers should always increment. The rollout
+should go from left to right, i.e. from the least critical stage to the
+most critical stage.
+
+Deploy/rollback a given version on the "test" stage:
+
+``` bash
+./deployer deploy ooni-api test 0.6~pr194-147
+```
+
+Deploy the latest build on the first stage:
+
+``` bash
+./deployer deploy ooni-api
+```
+
+Deploy the latest build on a given stage. This usage is not recommended as
+it deploys the latest build regardless of what is currently running on
+previous stages.
+
+``` bash
+./deployer deploy ooni-api prod
+```
+
+The deployer tool can also generate SVG badges that can then be served by
+[Nginx](#nginx) ⚙ or copied elsewhere to create a status dashboard.
+
+Example:
+
+![badge](../../../assets/images-backend/badge.png)
+
+Update all badges with:
+
+``` bash
+./deployer refresh_badges
+```
+
+
+### Adding new tests
+This runbook describes how to add support for a new test in the
+[Fastpath](#fastpath) ⚙.
+
+Review [Backend code changes](#backend-code-changes) 📒, then update
+[fastpath core](https://github.com/ooni/backend/blob/0ec9fba0eb9c4c440dcb7456f2aab529561104ae/fastpath/fastpath/core.py)
+to add a scoring function.
+
+See for example `def score_torsf(msm: dict) → dict:`
+
+Also add an `if` block to the `def score_measurement(msm: dict) → dict:`
+function to call the newly created function.
+
+Finish by adding a new test for the `score_measurement` function and
+relevant integration tests.
+
+Run the integration tests locally.
+
+Update the
+[api](https://github.com/ooni/backend/blob/0ec9fba0eb9c4c440dcb7456f2aab529561104ae/api/ooniapi/measurements.py#L491)
+if needed.
+
+Deploy on [ams-pg-test.ooni.org](#ams-pg-test.ooni.org) 🖥 and run end-to-end tests
+using real probes.
+
+
+### Adding support for a new test key
+This runbook describes how to modify the [Fastpath](#fastpath) ⚙
+and the [API](#api) ⚙ to extract, process, store and publish a new measurement
+field.
+
+Start with adding a new column to the [fastpath table](#fastpath-table) ⛁
+by following [Adding a new column to the fastpath](#adding-a-new-column-to-the-fastpath) 📒.
+
+Add the column to the local ClickHouse instance used for tests and
+[ams-pg-test.ooni.org](#ams-pg-test.ooni.org) 🖥.
+
+Update as described in
+[Continuous Deployment: Database schema changes](#continuous-deployment:-database-schema-changes) 💡.
+
+Add support for the new field in the fastpath `core.py` and `db.py` modules
+and related tests.
+See https://github.com/ooni/backend/pull/682 for a comprehensive example.
+
+Run tests locally, then open a draft pull request and ensure the CI tests
+run successfully.
+
+If needed, the pull request can be reviewed and deployed without modifying
+the API to expose the new column. This allows processing data sooner while
+the API is still being worked on.
+
+Add support for the new column in the API. The change depends on where and
+how the new value is to be published.
+See for a generic example of updating an SQL query in the API and updating
+related tests.
+
+Deploy the changes on the test and pre-production stages after creating the
+new column in the database.
+See [The deployer tool](#the-deployer-tool) 🔧 for details.
+
+Perform end-to-end tests with real probes and [Public and private web UIs](#public-and-private-web-uis) 💡 as needed.
+
+Complete the pull request and deploy to production.
+
+
+## Increasing the disk size on a dedicated host
+
+Below are some notes on how to resize the disks when a new drive is added to
+our dedicated hosts:
+
+```
+fdisk /dev/nvme3n1
+# create gpt partition table and new RAID 5 (label 42) partition using the CLI
+mdadm --manage /dev/md3 --add /dev/nvme3n1p1
+cat /proc/mdstat
+# Take note of the volume count (4) and validate that nvme3n1p1 is marked as spare ("S")
+mdadm --grow --raid-devices=4 /dev/md3
+```
+
+```
+# resize2fs /dev/md3
+# df -h | grep md3
+/dev/md3        2.6T  1.2T  1.3T  48% /
+```
+
+## Replicating MergeTree tables
+
+Notes on how to convert a MergeTree family table to a replicated table,
+while minimizing downtime.
+
+See the following links for more information:
+
+- https://kb.altinity.com/altinity-kb-setup-and-maintenance/altinity-kb-converting-mergetree-to-replicated/
+- https://clickhouse.com/docs/en/operations/system-tables/replicas
+- https://clickhouse.com/docs/en/architecture/replication#verify-that-clickhouse-keeper-is-running
+- https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication
+- https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings
+
+### Workflow
+
+You should first create the replicated database cluster following the
+instructions in the [clickhouse docs](https://clickhouse.com/docs/en/architecture/replication).
+
+The ooni-devops repo has a role called `oonidata_clickhouse` that does that by using the [idealista.clickhouse_role](https://github.com/idealista/clickhouse_role).
+
+Once the cluster is created you can proceed with creating a database on the cluster by running:
+
+```
+CREATE DATABASE ooni ON CLUSTER oonidata_cluster
+```
+
+There are now a few options to go about doing this:
+
+1. You just create the new replicated tables and perform a copy into the destination database by running the following on the source database:
+
+```
+INSERT INTO FUNCTION
+remote('destination-database.ooni.nu', 'obs_web', 'USER', 'PASSWORD')
+SELECT * from obs_web
+```
+
+This will require duplicating the data and might not be feasible.
+
+2. If you already have all the data set up on one host and you just want to convert the database into a replicated one, you can do the following:
+
+We assume there are 2 tables: `obs_web_bak` (the source table) and
+`obs_web` (the destination table). We also assume a single shard and
+multiple replicas.
+
+First create the destination replicated table. To retrieve the table create query you can run:
+
+```sql
+select create_table_query
+from system.tables
+where database = 'default' and table = 'obs_web'
+```
+
+You should then modify the table to make use of the `ReplicatedReplacingMergeTree` engine:
+
+```sql
+CREATE TABLE ooni.obs_web (`measurement_uid` String, `observation_idx` UInt16, `input` Nullable(String), `report_id` String, `measurement_start_time` DateTime64(3, 'UTC'), `software_name` String, `software_version` String, `test_name` String, `test_version` String, `bucket_date` String, `probe_asn` UInt32, `probe_cc` String, `probe_as_org_name` String, `probe_as_cc` String, `probe_as_name` String, `network_type` String, `platform` String, `origin` String, `engine_name` String, `engine_version` String, `architecture` String, `resolver_ip` String, `resolver_asn` UInt32, `resolver_cc` String, `resolver_as_org_name` String, `resolver_as_cc` String, `resolver_is_scrubbed` UInt8, `resolver_asn_probe` UInt32, `resolver_as_org_name_probe` String, `created_at` Nullable(DateTime('UTC')), `target_id` Nullable(String), `hostname` Nullable(String), `transaction_id` Nullable(UInt16), `ip` Nullable(String), `port` Nullable(UInt16), `ip_asn` Nullable(UInt32), `ip_as_org_name` Nullable(String), `ip_as_cc` Nullable(String), `ip_cc` Nullable(String), `ip_is_bogon` Nullable(UInt8), `dns_query_type` Nullable(String), `dns_failure` Nullable(String), `dns_engine` Nullable(String), `dns_engine_resolver_address` Nullable(String), `dns_answer_type` Nullable(String), `dns_answer` Nullable(String), `dns_answer_asn` Nullable(UInt32), `dns_answer_as_org_name` Nullable(String), `dns_t` Nullable(Float64), `tcp_failure` Nullable(String), `tcp_success` Nullable(UInt8), `tcp_t` Nullable(Float64), `tls_failure` Nullable(String), `tls_server_name` Nullable(String), `tls_version` Nullable(String), `tls_cipher_suite` Nullable(String), `tls_is_certificate_valid` Nullable(UInt8), `tls_end_entity_certificate_fingerprint` Nullable(String), `tls_end_entity_certificate_subject` Nullable(String), `tls_end_entity_certificate_subject_common_name` Nullable(String), `tls_end_entity_certificate_issuer` Nullable(String), `tls_end_entity_certificate_issuer_common_name` Nullable(String), `tls_end_entity_certificate_san_list` Array(String), `tls_end_entity_certificate_not_valid_after` Nullable(DateTime64(3, 'UTC')), `tls_end_entity_certificate_not_valid_before` Nullable(DateTime64(3, 'UTC')), `tls_certificate_chain_length` Nullable(UInt16), `tls_certificate_chain_fingerprints` Array(String), `tls_handshake_read_count` Nullable(UInt16), `tls_handshake_write_count` Nullable(UInt16), `tls_handshake_read_bytes` Nullable(UInt32), `tls_handshake_write_bytes` Nullable(UInt32), `tls_handshake_last_operation` Nullable(String), `tls_handshake_time` Nullable(Float64), `tls_t` Nullable(Float64), `http_request_url` Nullable(String), `http_network` Nullable(String), `http_alpn` Nullable(String), `http_failure` Nullable(String), `http_request_body_length` Nullable(UInt32), `http_request_method` Nullable(String), `http_runtime` Nullable(Float64), `http_response_body_length` Nullable(Int32), `http_response_body_is_truncated` Nullable(UInt8), `http_response_body_sha1` Nullable(String), `http_response_status_code` Nullable(UInt16), `http_response_header_location` Nullable(String), `http_response_header_server` Nullable(String), `http_request_redirect_from` Nullable(String), `http_request_body_is_truncated` Nullable(UInt8), `http_t` Nullable(Float64), `probe_analysis` Nullable(String))
+ENGINE = ReplicatedReplacingMergeTree(
+'/clickhouse/{cluster}/tables/{database}/{table}/{shard}',
+'{replica}'
+)
+PARTITION BY concat(substring(bucket_date, 1, 4), substring(bucket_date, 6, 2))
+PRIMARY KEY (measurement_uid, observation_idx)
+ORDER BY (measurement_uid, observation_idx, measurement_start_time, probe_cc, probe_asn) SETTINGS index_granularity = 8192
+```
+
+Check all the partitions that exist for the source table and produce ALTER queries to map them from the source to the destination:
+
+```sql
+SELECT DISTINCT 'ALTER TABLE ooni.obs_web ATTACH PARTITION ID \'' || partition_id || '\' FROM obs_web_bak;' from system.parts WHERE table = 'obs_web_bak' AND active;
+```
+
+Before running the generated `ATTACH` queries, stop all merges:
+
+```sql
+SYSTEM STOP MERGES;
+```
+
+Generating and running the queries can then be scripted like so:
+
+```sh
+clickhouse-client -q "SELECT DISTINCT 'ALTER TABLE ooni.obs_web ATTACH PARTITION ID \'' || partition_id || '\' FROM obs_web_bak;' from system.parts WHERE table = 'obs_web_bak' format TabSeparatedRaw" | clickhouse-client -u write --password XXXX -mn
+```
+
+You will now have a replicated table existing on one of the replicas.
+
+Then, on each other replica in the set, manually create the table, this time passing in the ZooKeeper path explicitly.
+
+You can get the ZooKeeper path by running the following on the first replica you have set up:
+
+```sql
+SELECT zookeeper_path FROM system.replicas WHERE table = 'obs_web';
+```
+
+For each replica you will then have to create the tables like so:
+
+```sql
+CREATE TABLE ooni.obs_web (`measurement_uid` String, `observation_idx` UInt16, `input` Nullable(String), `report_id` String, `measurement_start_time` DateTime64(3, 'UTC'), `software_name` String, `software_version` String, `test_name` String, `test_version` String, `bucket_date` String, `probe_asn` UInt32, `probe_cc` String, `probe_as_org_name` String, `probe_as_cc` String, `probe_as_name` String, `network_type` String, `platform` String, `origin` String, `engine_name` String, `engine_version` String, `architecture` String, `resolver_ip` String, `resolver_asn` UInt32, `resolver_cc` String, `resolver_as_org_name` String, `resolver_as_cc` String, `resolver_is_scrubbed` UInt8, `resolver_asn_probe` UInt32, `resolver_as_org_name_probe` String, `created_at` Nullable(DateTime('UTC')), `target_id` Nullable(String), `hostname` Nullable(String), `transaction_id` Nullable(UInt16), `ip` Nullable(String), `port` Nullable(UInt16), `ip_asn` Nullable(UInt32), `ip_as_org_name` Nullable(String), `ip_as_cc` Nullable(String), `ip_cc` Nullable(String), `ip_is_bogon` Nullable(UInt8), `dns_query_type` Nullable(String), `dns_failure` Nullable(String), `dns_engine` Nullable(String), `dns_engine_resolver_address` Nullable(String), `dns_answer_type` Nullable(String), `dns_answer` Nullable(String), `dns_answer_asn` Nullable(UInt32), `dns_answer_as_org_name` Nullable(String), `dns_t` Nullable(Float64), `tcp_failure` Nullable(String), `tcp_success` Nullable(UInt8), `tcp_t` Nullable(Float64), `tls_failure` Nullable(String), `tls_server_name` Nullable(String), `tls_version` Nullable(String), `tls_cipher_suite` Nullable(String),
`tls_is_certificate_valid` Nullable(UInt8), `tls_end_entity_certificate_fingerprint` Nullable(String), `tls_end_entity_certificate_subject` Nullable(String), `tls_end_entity_certificate_subject_common_name` Nullable(String), `tls_end_entity_certificate_issuer` Nullable(String), `tls_end_entity_certificate_issuer_common_name` Nullable(String), `tls_end_entity_certificate_san_list` Array(String), `tls_end_entity_certificate_not_valid_after` Nullable(DateTime64(3, 'UTC')), `tls_end_entity_certificate_not_valid_before` Nullable(DateTime64(3, 'UTC')), `tls_certificate_chain_length` Nullable(UInt16), `tls_certificate_chain_fingerprints` Array(String), `tls_handshake_read_count` Nullable(UInt16), `tls_handshake_write_count` Nullable(UInt16), `tls_handshake_read_bytes` Nullable(UInt32), `tls_handshake_write_bytes` Nullable(UInt32), `tls_handshake_last_operation` Nullable(String), `tls_handshake_time` Nullable(Float64), `tls_t` Nullable(Float64), `http_request_url` Nullable(String), `http_network` Nullable(String), `http_alpn` Nullable(String), `http_failure` Nullable(String), `http_request_body_length` Nullable(UInt32), `http_request_method` Nullable(String), `http_runtime` Nullable(Float64), `http_response_body_length` Nullable(Int32), `http_response_body_is_truncated` Nullable(UInt8), `http_response_body_sha1` Nullable(String), `http_response_status_code` Nullable(UInt16), `http_response_header_location` Nullable(String), `http_response_header_server` Nullable(String), `http_request_redirect_from` Nullable(String), `http_request_body_is_truncated` Nullable(UInt8), `http_t` Nullable(Float64), `probe_analysis` Nullable(String)) +ENGINE = ReplicatedReplacingMergeTree( +'/clickhouse/oonidata_cluster/tables/ooni/obs_web/01', +'{replica}' +) +PARTITION BY concat(substring(bucket_date, 1, 4), substring(bucket_date, 6, 2)) +PRIMARY KEY (measurement_uid, observation_idx) +ORDER BY (measurement_uid, observation_idx, measurement_start_time, probe_cc, probe_asn) SETTINGS index_granularity = 8192 +``` + +You will then have to manually copy the data over to the destination replica from the source. 
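One way to do the copy, assuming direct SSH access between the replicas, is a plain rsync of the table's data directory; a sketch (the destination host name is a placeholder, and the path follows the layout described below):

```bash
# Copy the raw table data from the source replica to a destination replica.
# Run this while merges are stopped; adjust ownership before restarting writes.
rsync -av /var/lib/clickhouse/data/ooni/obs_web/ \
    clickhouse2.prod.ooni.io:/var/lib/clickhouse/data/ooni/obs_web/
ssh clickhouse2.prod.ooni.io chown -R clickhouse:clickhouse \
    /var/lib/clickhouse/data/ooni/obs_web
```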
+
+The data lives inside of `/var/lib/clickhouse/data/{database_name}/{table_name}`.
+
+Once the data has been copied over, the table is replicated and you can
+resume merges on all databases by running:
+
+```sql
+SYSTEM START MERGES;
+```
+
+### Creating tables on clusters
+
+New tables can be created on all the replicas in the cluster at once by
+using the `ON CLUSTER` clause, for example:
+
+```sql
+CREATE TABLE ooni.obs_web_ctrl ON CLUSTER oonidata_cluster
+(`measurement_uid` String, `observation_idx` UInt16, `input` Nullable(String), `report_id` String, `measurement_start_time` DateTime64(3, 'UTC'), `software_name` String, `software_version` String, `test_name` String, `test_version` String, `bucket_date` String, `hostname` String, `created_at` Nullable(DateTime64(3, 'UTC')), `ip` String, `port` Nullable(UInt16), `ip_asn` Nullable(UInt32), `ip_as_org_name` Nullable(String), `ip_as_cc` Nullable(String), `ip_cc` Nullable(String), `ip_is_bogon` Nullable(UInt8), `dns_failure` Nullable(String), `dns_success` Nullable(UInt8), `tcp_failure` Nullable(String), `tcp_success` Nullable(UInt8), `tls_failure` Nullable(String), `tls_success` Nullable(UInt8), `tls_server_name` Nullable(String), `http_request_url` Nullable(String), `http_failure` Nullable(String), `http_success` Nullable(UInt8), `http_response_body_length` Nullable(Int32))
+ENGINE = ReplicatedReplacingMergeTree(
+'/clickhouse/{cluster}/tables/{database}/{table}/{shard}',
+'{replica}'
+)
+PARTITION BY concat(substring(bucket_date, 1, 4), substring(bucket_date, 6, 2))
+PRIMARY KEY (measurement_uid, observation_idx) ORDER BY (measurement_uid, observation_idx, measurement_start_time, hostname) SETTINGS index_granularity = 8192
+```
diff --git a/docs/Tools.md b/docs/Tools.md
new file mode 100644
index 00000000..73d9f078
--- /dev/null
+++ b/docs/Tools.md
@@ -0,0 +1,211 @@
+
+### Geolocation script
+The following script can be used to compare the geolocation reported by
+the probes submitting measurements with the geolocation of the
+`/24` subnet the probe is coming from. It is meant to be run on
+[backend-fsn.ooni.org](#backend-fsn.ooni.org) 🖥.
+
+``` python
+#!/usr/bin/env python3
+
+from time import sleep
+
+import systemd.journal
+import geoip2.database  # type: ignore
+
+asnfn = "/var/lib/ooniapi/asn.mmdb"
+ccfn = "/var/lib/ooniapi/cc.mmdb"
+geoip_asn_reader = geoip2.database.Reader(asnfn)
+geoip_cc_reader = geoip2.database.Reader(ccfn)
+
+
+def follow_journal():
+    journal = systemd.journal.Reader()
+    #journal.seek_tail()
+    journal.get_previous()
+    journal.add_match(_SYSTEMD_UNIT="nginx.service")
+    while True:
+        try:
+            event = journal.wait(-1)
+            if event == systemd.journal.APPEND:
+                for entry in journal:
+                    yield entry["MESSAGE"]
+        except Exception as e:
+            print(e)
+            sleep(0.1)
+
+
+def geolookup(ipaddr: str):
+    cc = geoip_cc_reader.country(ipaddr).country.iso_code
+    asn = geoip_asn_reader.asn(ipaddr).autonomous_system_number
+    return cc, asn
+
+
+def process(rawmsg):
+    if ' "POST /report/' not in rawmsg:
+        return
+    msg = rawmsg.strip().split()
+    ipaddr = msg[2]
+    ipaddr2 = msg[3]
+    path = msg[8][8:]
+    tsamp, tn, probe_cc, probe_asn, collector, rand = path.split("_")
+    geo_cc, geo_asn = geolookup(ipaddr)
+    proxied = 0
+    probe_type = rawmsg.rsplit('"', 2)[-2]
+    if "," in probe_type:
+        return
+    if ipaddr2 != "0.0.0.0":
+        proxied = 1
+        # Probably CloudFront, use second ipaddr
+        geo_cc, geo_asn = geolookup(ipaddr2)
+
+    print(f"{probe_cc},{geo_cc},{probe_asn},{geo_asn},{proxied},{probe_type}")
+
+
+def main():
+    for msg in follow_journal():
+        if msg is None:
+            break
+        try:
+            process(msg)
+        except Exception as e:
+            print(e)
+            sleep(0.1)
+
+
+if __name__ == "__main__":
+    main()
+```
+
+
+### Test list prioritization monitoring
+The following script monitors the prioritized test list for changes in URLs
+for a set of countries. It outputs StatsD metrics.
+
+> **note**
+> The prioritization system has been modified to work on a granularity of
+> probe_cc + probe_asn rather than whole countries.
+
+Country-wise changes might be misleading. The script can be modified to
+filter for a set of CCs+ASNs.
+
+``` python
+#!/usr/bin/env python3
+
+from time import sleep
+import urllib.request
+import json
+
+import statsd  # debdeps: python3-statsd
+
+metrics = statsd.StatsClient("127.0.0.1", 8125, prefix="test-list-changes")
+
+CCs = ["GE", "IT", "US"]
+THRESH = 100
+
+
+def peek(cc, listmap) -> None:
+    url = f"https://api.ooni.io/api/v1/test-list/urls?country_code={cc}&debug=True"
+    res = urllib.request.urlopen(url)
+    j = json.load(res)
+    top = j["results"][:THRESH]  # list of dicts
+    top_urls = set(d["url"] for d in top)
+
+    if cc in listmap:
+        old = listmap[cc]
+        changed = old.symmetric_difference(top_urls)
+        tot_cnt = len(old.union(top_urls))
+        changed_ratio = len(changed) / tot_cnt * 100
+        metrics.gauge(f"-{cc}", changed_ratio)
+
+    listmap[cc] = top_urls
+
+
+def main() -> None:
+    listmap = {}
+    while True:
+        for cc in CCs:
+            try:
+                peek(cc, listmap)
+            except Exception as e:
+                print(e)
+                sleep(1)
+        sleep(60 * 10)
+
+
+if __name__ == "__main__":
+    main()
+```
+
+### Recompressing postcans on S3
+The following script can be used to compress `.tar.gz` files in the S3 data
+bucket that were actually uploaded as uncompressed tar archives.
+It keeps a copy of the original files locally as a backup.
+It terminates once a correctly compressed file is found.
+Running the script on an AWS host close to the S3 bucket can significantly
+speed up the process.
+
+Tested with the packages:
+
+ * python3-boto3 1.28.49+dfsg-1
+ * python3-magic 2:0.4.27-2
+
+Set the ACCESS_KEY and SECRET_KEY environment variables.
+Update the PREFIX variable as needed.
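For example, the environment can be prepared and the script launched like this (the key values and the script file name are illustrative):

```bash
export ACCESS_KEY="AKIA..."      # key pair with read/write access to the bucket
export SECRET_KEY="..."
python3 recompress_postcans.py   # hypothetical file name for the script below
```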
+
+```python
+#!/usr/bin/env python3
+from os import getenv, rename
+from sys import exit
+import boto3
+import gzip
+import magic
+
+BUCKET_NAME = "ooni-data-eu-fra-test"
+# BUCKET_NAME = "ooni-data-eu-fra"
+PREFIX = "raw/2021"
+
+def fetch_files():
+    s3 = boto3.client(
+        "s3",
+        aws_access_key_id=getenv("ACCESS_KEY"),
+        aws_secret_access_key=getenv("SECRET_KEY"),
+    )
+    cont_token = None
+    while True:
+        kw = {} if cont_token is None else dict(ContinuationToken=cont_token)
+        r = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=PREFIX, **kw)
+        cont_token = r.get("NextContinuationToken", None)
+        for i in r.get("Contents", []):
+            k = i["Key"]
+            if k.endswith(".tar.gz"):
+                fn = k.rsplit("/", 1)[-1]
+                s3.download_file(BUCKET_NAME, k, fn)
+                yield k, fn
+        if cont_token is None:
+            return
+
+def main():
+    s3res = boto3.Session(
+        aws_access_key_id=getenv("ACCESS_KEY"),
+        aws_secret_access_key=getenv("SECRET_KEY"),
+    ).resource("s3")
+    for s3key, fn in fetch_files():
+        ft = magic.from_file(fn)
+        if "tar archive" not in ft:
+            print(f"found {ft} at {s3key}")
+            # continue  # simply ignore already compressed files
+            exit()  # stop when compressed files are found
+        tarfn = fn[:-3]
+        rename(fn, tarfn)  # keep the local file as a backup
+        with open(tarfn, "rb") as f:
+            inp = f.read()
+        comp = gzip.compress(inp, compresslevel=9)
+        ratio = len(inp) / len(comp)
+        del inp
+        print(f"uploading {s3key} compression ratio {ratio}")
+        obj = s3res.Object(BUCKET_NAME, s3key)
+        obj.put(Body=comp)
+        del comp
+
+main()
+```
diff --git a/scripts/build-docs.sh b/scripts/build-docs.sh
index 0d049166..7bc606da 100755
--- a/scripts/build-docs.sh
+++ b/scripts/build-docs.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 DOCS_ROOT=dist/docs/
 REPO_NAME="ooni/devops"
+MAIN_BRANCH="main"
 COMMIT_HASH=$(git rev-parse --short HEAD)
 
 mkdir -p $DOCS_ROOT
 
@@ -12,38 +13,36 @@ strip_title() {
 cat $infile | awk 'BEGIN{p=1} /^#/{if(p){p=0; next}} {print}'
 }
 
-cat <<EOF >$DOCS_ROOT/00-index.md
----
-# Do not edit! This file is automatically generated
-# to edit go to: https://github.com/$REPO_NAME/edit/main/README.md
-# version: $REPO_NAME:$COMMIT_HASH
-title: OONI Devops
-description: OONI Devops
-slug: devops
----
-EOF
-strip_title README.md >> $DOCS_ROOT/00-index.md
+generate_doc() {
+  local order="$1"
+  local slug="$2"
+  local input_file="$3"
+  local output_file="$4"
+  local title="$5"
+  local description="$6"
 
-cat <<EOF >$DOCS_ROOT/01-iac.md
+  cat <<EOF >"$DOCS_ROOT/$output_file"
 ---
 # Do not edit! This file is automatically generated
-# to edit go to: https://github.com/$REPO_NAME/edit/main/tf/README.md
-# version: $REPO_NAME:$COMMIT_HASH
-title: OONI Devops IaC
-description: OONI Devops IaC Documentation
-slug: devops/iac
+# version: $REPO_NAME/$input_file:$COMMIT_HASH
+title: $title
+description: $description
+slug: $slug
+sidebar:
+  order: $order
 ---
 EOF
-strip_title tf/README.md >> $DOCS_ROOT/01-iac.md
+  echo "[edit file](https://github.com/$REPO_NAME/edit/$MAIN_BRANCH/$input_file)" >> "$DOCS_ROOT/$output_file"
+  strip_title "$input_file" >> "$DOCS_ROOT/$output_file"
+}
 
-cat <<EOF >$DOCS_ROOT/02-configuration-management.md
----
-# Do not edit! This file is automatically generated
-# to edit go to: https://github.com/$REPO_NAME/edit/main/ansible/README.md
-# version: $REPO_NAME:$COMMIT_HASH
-title: OONI Devops Configuration Management
-description: OONI Devops Configuration Management Documentation
-slug: devops/configuration-management
----
-EOF
-strip_title ansible/README.md >> $DOCS_ROOT/02-configuration-management.md
\ No newline at end of file
+generate_doc 0 "devops" "README.md" "00-index.md" "OONI Devops" "OONI Devops"
+generate_doc 1 "devops/infrastructure" "docs/Infrastructure.md" "01-infrastructure.md" "Infrastructure" "Infrastructure documentation"
+generate_doc 2 "devops/monitoring" "docs/MonitoringAlerts.md" "02-monitoring-alerts.md" "Monitoring" "Monitoring and Alerts documentation"
+generate_doc 3 "devops/runbooks" "docs/Runbooks.md" "03-runbooks.md" "Runbooks" "Runbooks docs"
+generate_doc 4 "devops/incident-response" "docs/IncidentResponse.md" "04-incident-response.md" "Incident response" "Incident response handling guidelines"
+generate_doc 5 "devops/terraform" "tf/README.md" "05-terraform.md" "Terraform setup" "Terraform setup"
+generate_doc 6 "devops/ansible" "ansible/README.md" "06-ansible.md" "Ansible setup" "Ansible setup"
+generate_doc 7 "devops/tools" "docs/Tools.md" "07-tools.md" "Misc Tools" "Misc Tools"
+generate_doc 8 "devops/debian-packages" "docs/DebianPackages.md" "08-debian-packages.md" "Debian Packages" "Debian Packages"
+generate_doc 9 "devops/legacy-docs" "docs/LegacyDocs.md" "09-legacy-docs.md" "Legacy Documentation" "Legacy Documentation"
diff --git a/scripts/cluster-migration/benchmark.sql b/scripts/cluster-migration/benchmark.sql
new file mode 100644
index 00000000..55e06781
--- /dev/null
+++ b/scripts/cluster-migration/benchmark.sql
@@ -0,0 +1,55 @@
+SELECT
+    countIf (
+        anomaly = 't'
+        AND confirmed = 'f'
+        AND msm_failure = 'f'
+    ) AS anomaly_count,
+    countIf (
+        confirmed = 't'
+        AND msm_failure = 'f'
+    ) AS confirmed_count,
+    countIf (msm_failure = 't') AS failure_count,
+    countIf (
+        anomaly = 'f'
+        AND confirmed = 'f'
+        AND msm_failure = 'f'
+    ) AS ok_count,
+    COUNT(*) AS measurement_count,
+    domain
+FROM
+    fastpath
+WHERE
+    measurement_start_time >= '2024-11-01'
+    AND measurement_start_time < '2024-11-10'
+    AND probe_cc = 'IT'
+GROUP BY
+    domain;
+
+SELECT
+    COUNT(*) AS measurement_count,
+    domain
+FROM
+    analysis_web_measurement
+WHERE
+    measurement_start_time >= '2024-11-01'
+    AND measurement_start_time < '2024-11-10'
+    AND probe_cc = 'IT'
+GROUP BY
+    domain;
+
+ALTER TABLE ooni.analysis_web_measurement ON CLUSTER oonidata_cluster MODIFY
+ORDER BY
+    (
+        measurement_start_time,
+        probe_cc,
+        probe_asn,
+        domain,
+        measurement_uid
+    );
+ALTER TABLE ooni.analysis_web_measurement ON CLUSTER oonidata_cluster ADD INDEX IF NOT EXISTS measurement_start_time_idx measurement_start_time TYPE minmax GRANULARITY 2;
+
+ALTER TABLE ooni.analysis_web_measurement ON CLUSTER oonidata_cluster MATERIALIZE INDEX measurement_start_time_idx;
+
+ALTER TABLE ooni.analysis_web_measurement ON CLUSTER oonidata_cluster ADD INDEX IF NOT EXISTS probe_cc_idx probe_cc TYPE minmax GRANULARITY 1;
+
+ALTER TABLE ooni.analysis_web_measurement ON CLUSTER oonidata_cluster MATERIALIZE INDEX probe_cc_idx;
\ No newline at end of file
diff --git a/scripts/cluster-migration/db-sample.py b/scripts/cluster-migration/db-sample.py
new file mode 100644
index 00000000..d4544135
--- /dev/null
+++ b/scripts/cluster-migration/db-sample.py
@@ -0,0 +1,33 @@
+from datetime import datetime, timedelta
+import csv
+
+from tqdm import
tqdm +from clickhouse_driver import Client as ClickhouseClient + + +START_TIME = datetime(2024, 11, 1, 0, 0, 0) +END_TIME = datetime(2024, 11, 10, 0, 0, 0) +SAMPLE_SIZE = 100 + + +def sample_to_file(table_name): + with ClickhouseClient.from_url("clickhouse://localhost/ooni") as click, open( + f"{table_name}-sample.csv", "w" + ) as out_file: + writer = csv.writer(out_file) + ts = START_TIME + while ts < END_TIME: + for row in click.execute_iter( + f""" + SELECT * FROM {table_name} + WHERE measurement_uid LIKE '{ts.strftime("%Y%m%d%H")}%' + ORDER BY measurement_uid LIMIT {SAMPLE_SIZE} + """ + ): + writer.writerow(row) + ts += timedelta(hours=1) + + +if __name__ == "__main__": + sample_to_file("obs_web") + sample_to_file("analysis_web_measurement") diff --git a/scripts/cluster-migration/migrate-tables.py b/scripts/cluster-migration/migrate-tables.py new file mode 100644 index 00000000..2a3d4bfb --- /dev/null +++ b/scripts/cluster-migration/migrate-tables.py @@ -0,0 +1,38 @@ +import os + +from tqdm import tqdm +from clickhouse_driver import Client as ClickhouseClient + + +WRITE_CLICKHOUSE_URL = os.environ["WRITE_CLICKHOUSE_URL"] + + +def stream_table(table_name, where_clause): + with ClickhouseClient.from_url("clickhouse://backend-fsn.ooni.org/") as click: + for row in click.execute_iter(f"SELECT * FROM {table_name} {where_clause}"): + yield row + + +def copy_table(table_name, where_clause): + with ClickhouseClient.from_url(WRITE_CLICKHOUSE_URL) as click_writer: + buf = [] + for row in tqdm(stream_table(table_name=table_name, where_clause=where_clause)): + buf.append(row) + if len(buf) > 50_000: + click_writer.execute(f"INSERT INTO {table_name} VALUES", buf) + buf = [] + + if len(buf) > 0: + click_writer.execute(f"INSERT INTO {table_name} VALUES", buf) + + +if __name__ == "__main__": + assert WRITE_CLICKHOUSE_URL, "WRITE_CLICKHOUSE_URL environment variable is not set" + print("## copying `fastpath` table") + copy_table("fastpath", "WHERE measurement_uid < '20241127'") + print("## copying `jsonl` table") + copy_table("jsonl", "WHERE measurement_uid < '20241127'") + print("## copying `citizenlab` table") + copy_table("citizenlab", "") + print("## copying `citizenlab_flip` table") + copy_table("citizenlab_flip", "") diff --git a/scripts/cluster-migration/schema.sql b/scripts/cluster-migration/schema.sql new file mode 100644 index 00000000..7588f060 --- /dev/null +++ b/scripts/cluster-migration/schema.sql @@ -0,0 +1,137 @@ +CREATE TABLE + ooni.jsonl ON CLUSTER oonidata_cluster ( + `report_id` String, + `input` String, + `s3path` String, + `linenum` Int32, + `measurement_uid` String, + `date` Date, + `source` String, + `update_time` DateTime64 (3) MATERIALIZED now64 () + ) ENGINE = ReplicatedReplacingMergeTree ( + '/clickhouse/{cluster}/tables/ooni/jsonl/{shard}', + '{replica}', + update_time + ) +ORDER BY + (report_id, input, measurement_uid) SETTINGS index_granularity = 8192; + +CREATE TABLE + ooni.fastpath ON CLUSTER oonidata_cluster ( + `measurement_uid` String, + `report_id` String, + `input` String, + `probe_cc` LowCardinality (String), + `probe_asn` Int32, + `test_name` LowCardinality (String), + `test_start_time` DateTime, + `measurement_start_time` DateTime, + `filename` String, + `scores` String, + `platform` String, + `anomaly` String, + `confirmed` String, + `msm_failure` String, + `domain` String, + `software_name` String, + `software_version` String, + `control_failure` String, + `blocking_general` Float32, + `is_ssl_expected` Int8, + `page_len` Int32, + `page_len_ratio` Float32, + 
`server_cc` String, + `server_asn` Int8, + `server_as_name` String, + `update_time` DateTime64 (3) MATERIALIZED now64 (), + `test_version` String, + `architecture` String, + `engine_name` LowCardinality (String), + `engine_version` String, + `test_runtime` Float32, + `blocking_type` String, + `test_helper_address` LowCardinality (String), + `test_helper_type` LowCardinality (String), + `ooni_run_link_id` Nullable (UInt64), + INDEX fastpath_rid_idx report_id TYPE minmax GRANULARITY 1, + INDEX measurement_uid_idx measurement_uid TYPE minmax GRANULARITY 8 + ) ENGINE = ReplicatedReplacingMergeTree ( + '/clickhouse/{cluster}/tables/ooni/fastpath/{shard}', + '{replica}', + update_time + ) +ORDER BY + ( + measurement_start_time, + report_id, + input, + measurement_uid + ) SETTINGS index_granularity = 8192; + +CREATE TABLE + ooni.citizenlab ON CLUSTER oonidata_cluster ( + `domain` String, + `url` String, + `cc` FixedString (32), + `category_code` String + ) ENGINE = ReplicatedReplacingMergeTree ( + '/clickhouse/{cluster}/tables/ooni/citizenlab/{shard}', + '{replica}' + ) +ORDER BY + (domain, url, cc, category_code) SETTINGS index_granularity = 4; + +CREATE TABLE + ooni.citizenlab_flip ON CLUSTER oonidata_cluster ( + `domain` String, + `url` String, + `cc` FixedString (32), + `category_code` String + ) ENGINE = ReplicatedReplacingMergeTree ( + '/clickhouse/{cluster}/tables/ooni/citizenlab_flip/{shard}', + '{replica}' + ) +ORDER BY + (domain, url, cc, category_code) SETTINGS index_granularity = 4; + +CREATE TABLE + analysis_web_measurement ON CLUSTER oonidata_cluster ( + `domain` String, + `input` String, + `test_name` String, + `probe_asn` UInt32, + `probe_as_org_name` String, + `probe_cc` String, + `resolver_asn` UInt32, + `resolver_as_cc` String, + `network_type` String, + `measurement_start_time` DateTime64 (3, 'UTC'), + `measurement_uid` String, + `ooni_run_link_id` String, + `top_probe_analysis` Nullable (String), + `top_dns_failure` Nullable (String), + `top_tcp_failure` Nullable (String), + `top_tls_failure` Nullable (String), + `dns_blocked` Float32, + `dns_down` Float32, + `dns_ok` Float32, + `tcp_blocked` Float32, + `tcp_down` Float32, + `tcp_ok` Float32, + `tls_blocked` Float32, + `tls_down` Float32, + `tls_ok` Float32 + ) ENGINE = ReplicatedReplacingMergeTree ( + '/clickhouse/{cluster}/tables/ooni/analysis_web_measurement/{shard}', + '{replica}' + ) +PARTITION BY + substring(measurement_uid, 1, 6) PRIMARY KEY measurement_uid +ORDER BY + ( + measurement_uid, + measurement_start_time, + probe_cc, + probe_asn, + domain + ) SETTINGS index_granularity = 8192; \ No newline at end of file diff --git a/tf/README.md b/tf/README.md index 8f1ee41a..b9ae6785 100644 --- a/tf/README.md +++ b/tf/README.md @@ -9,9 +9,14 @@ Terraform is used for managing the OONI infrastructure as code. 
``` [oonidevops_user] -aws_access_key_id = XXXX -aws_secret_access_key = YYYY -role_arn = arn:aws:iam::OONI_ORG_ID:role/oonidevops +aws_access_key_id = YYYY +aws_secret_access_key = ZZZ +[oonidevops_user_dev] +role_arn = arn:aws:iam::905418398257:role/oonidevops +source_profile = oonidevops_user +[oonidevops_user_prod] +role_arn = arn:aws:iam::471112720364:role/oonidevops +source_profile = oonidevops_user ``` Where you replace OONI_ORG_ID with the ID of the ORG you are deploying to (dev, diff --git a/tf/environments/dev/main.tf b/tf/environments/dev/main.tf index 7809b94b..47e3ccd1 100644 --- a/tf/environments/dev/main.tf +++ b/tf/environments/dev/main.tf @@ -34,10 +34,13 @@ provider "aws" { # source_profile = oonidevops_user } -# In order for this provider to work you have to set the following environment -# variable to your DigitalOcean API token: -# DIGITALOCEAN_ACCESS_TOKEN= -provider "digitalocean" {} +data "aws_ssm_parameter" "do_token" { + name = "/oonidevops/secrets/digitalocean_access_token" +} + +provider "digitalocean" { + token = data.aws_ssm_parameter.do_token.value +} data "aws_availability_zones" "available" {} @@ -72,10 +75,10 @@ module "adm_iam_roles" { source = "../../modules/adm_iam_roles" authorized_accounts = [ + "arn:aws:iam::${local.ooni_main_org_id}:user/art", "arn:aws:iam::${local.ooni_main_org_id}:user/mehul", - "arn:aws:iam::${local.ooni_dev_org_id}:user/mehul", - "arn:aws:iam::${local.ooni_dev_org_id}:user/art", - "arn:aws:iam::${local.ooni_main_org_id}:user/art" + "arn:aws:iam::${local.ooni_main_org_id}:user/luis", + "arn:aws:iam::${local.ooni_main_org_id}:user/tony" ] } @@ -226,6 +229,10 @@ resource "aws_secretsmanager_secret_version" "oonipg_url" { ) } +data "aws_ssm_parameter" "clickhouse_readonly_url" { + name = "/oonidevops/secrets/clickhouse_readonly_url" +} + resource "random_id" "artifact_id" { byte_length = 4 } @@ -277,31 +284,6 @@ module "ooni_th_droplet" { dns_zone_ooni_io = local.dns_zone_ooni_io } -module "ooni_backendproxy" { - source = "../../modules/ooni_backendproxy" - - stage = local.environment - - vpc_id = module.network.vpc_id - subnet_id = module.network.vpc_subnet_public[0].id - private_subnet_cidr = module.network.vpc_subnet_private[*].cidr_block - dns_zone_ooni_io = local.dns_zone_ooni_io - - key_name = module.adm_iam_roles.oonidevops_key_name - instance_type = "t2.micro" - - backend_url = "https://backend-hel.ooni.org/" - wcth_addresses = module.ooni_th_droplet.droplet_ipv4_address - wcth_domain_suffix = "th.dev.ooni.io" - clickhouse_url = "backend-hel.ooni.org" - clickhouse_port = "9000" - - tags = merge( - local.tags, - { Name = "ooni-tier0-backendproxy" } - ) -} - ### OONI Services Clusters module "ooniapi_cluster" { @@ -314,9 +296,9 @@ module "ooniapi_cluster" { asg_min = 2 asg_max = 6 - asg_desired = 3 + asg_desired = 2 - instance_type = "t3.micro" + instance_type = "t3a.micro" tags = merge( local.tags, @@ -346,8 +328,7 @@ module "ooniapi_ooniprobe_deployer" { module "ooniapi_ooniprobe" { source = "../../modules/ooniapi_service" - task_cpu = 256 - task_memory = 512 + task_memory = 64 # First run should be set on first run to bootstrap the task definition # first_run = true @@ -379,44 +360,48 @@ module "ooniapi_ooniprobe" { ) } -#### OONI Measurements service +#### OONI Backend proxy service -module "ooniapi_oonimeasurements_deployer" { +module "ooniapi_reverseproxy_deployer" { source = "../../modules/ooniapi_service_deployer" - service_name = "oonimeasurements" + service_name = "reverseproxy" repo = "ooni/backend" branch_name = 
"master" - buildspec_path = "ooniapi/services/oonimeasurements/buildspec.yml" + buildspec_path = "ooniapi/services/reverseproxy/buildspec.yml" codestar_connection_arn = aws_codestarconnections_connection.oonidevops.arn codepipeline_bucket = aws_s3_bucket.ooniapi_codepipeline_bucket.bucket - ecs_service_name = module.ooniapi_oonimeasurements.ecs_service_name + ecs_service_name = module.ooniapi_reverseproxy.ecs_service_name ecs_cluster_name = module.ooniapi_cluster.cluster_name } -module "ooniapi_oonimeasurements" { +module "ooniapi_reverseproxy" { source = "../../modules/ooniapi_service" + task_memory = 64 + + # First run should be set on first run to bootstrap the task definition + # first_run = true + vpc_id = module.network.vpc_id public_subnet_ids = module.network.vpc_subnet_public[*].id private_subnet_ids = module.network.vpc_subnet_private[*].id - service_name = "oonimeasurements" - default_docker_image_url = "ooni/api-oonimeasurements:latest" + service_name = "reverseproxy" + default_docker_image_url = "ooni/api-reverseproxy:latest" stage = local.environment dns_zone_ooni_io = local.dns_zone_ooni_io key_name = module.adm_iam_roles.oonidevops_key_name ecs_cluster_id = module.ooniapi_cluster.cluster_id task_secrets = { - JWT_ENCRYPTION_KEY = aws_secretsmanager_secret_version.jwt_secret.arn PROMETHEUS_METRICS_PASSWORD = aws_secretsmanager_secret_version.prometheus_metrics_password.arn } task_environment = { - CLICKHOUSE_URL = "backend-hel.ooni.org" + TARGET_URL = "https://backend-hel.ooni.org/" } ooniapi_service_security_groups = [ @@ -425,10 +410,37 @@ module "ooniapi_oonimeasurements" { tags = merge( local.tags, - { Name = "ooni-tier0-oonimeasurements" } + { Name = "ooni-tier0-reverseproxy" } + ) +} + +module "ooni_backendproxy" { + source = "../../modules/ooni_backendproxy" + + stage = local.environment + + vpc_id = module.network.vpc_id + subnet_id = module.network.vpc_subnet_public[0].id + private_subnet_cidr = module.network.vpc_subnet_private[*].cidr_block + dns_zone_ooni_io = local.dns_zone_ooni_io + + key_name = module.adm_iam_roles.oonidevops_key_name + instance_type = "t3a.nano" + + backend_url = "https://backend-fsn.ooni.org/" + wcth_addresses = module.ooni_th_droplet.droplet_ipv4_address + wcth_domain_suffix = "th.ooni.org" + clickhouse_url = "clickhouse1.prod.ooni.io" + clickhouse_port = "9000" + + tags = merge( + local.tags, + { Name = "ooni-tier0-backendproxy" } ) } + + #### OONI Run service module "ooniapi_oonirun_deployer" { @@ -449,8 +461,7 @@ module "ooniapi_oonirun_deployer" { module "ooniapi_oonirun" { source = "../../modules/ooniapi_service" - task_cpu = 256 - task_memory = 512 + task_memory = 64 vpc_id = module.network.vpc_id public_subnet_ids = module.network.vpc_subnet_public[*].id @@ -487,7 +498,7 @@ module "ooniapi_oonifindings_deployer" { service_name = "oonifindings" repo = "ooni/backend" - branch_name = "master" + branch_name = "oonidata" buildspec_path = "ooniapi/services/oonifindings/buildspec.yml" codestar_connection_arn = aws_codestarconnections_connection.oonidevops.arn @@ -500,8 +511,7 @@ module "ooniapi_oonifindings_deployer" { module "ooniapi_oonifindings" { source = "../../modules/ooniapi_service" - task_cpu = 256 - task_memory = 512 + task_memory = 64 vpc_id = module.network.vpc_id public_subnet_ids = module.network.vpc_subnet_public[*].id @@ -518,6 +528,7 @@ module "ooniapi_oonifindings" { POSTGRESQL_URL = aws_secretsmanager_secret_version.oonipg_url.arn JWT_ENCRYPTION_KEY = aws_secretsmanager_secret_version.jwt_secret.arn 
PROMETHEUS_METRICS_PASSWORD = aws_secretsmanager_secret_version.prometheus_metrics_password.arn + CLICKHOUSE_URL = data.aws_ssm_parameter.clickhouse_readonly_url.arn } ooniapi_service_security_groups = [ @@ -551,8 +562,7 @@ module "ooniapi_ooniauth_deployer" { module "ooniapi_ooniauth" { source = "../../modules/ooniapi_service" - task_cpu = 256 - task_memory = 512 + task_memory = 64 vpc_id = module.network.vpc_id public_subnet_ids = module.network.vpc_subnet_public[*].id @@ -608,7 +618,7 @@ module "ooniapi_frontend" { vpc_id = module.network.vpc_id subnet_ids = module.network.vpc_subnet_public[*].id - oonibackend_proxy_target_group_arn = module.ooni_backendproxy.alb_target_group_id + oonibackend_proxy_target_group_arn = module.ooniapi_reverseproxy.alb_target_group_id ooniapi_oonirun_target_group_arn = module.ooniapi_oonirun.alb_target_group_id ooniapi_ooniauth_target_group_arn = module.ooniapi_ooniauth.alb_target_group_id ooniapi_ooniprobe_target_group_arn = module.ooniapi_ooniprobe.alb_target_group_id @@ -645,7 +655,7 @@ locals { } resource "aws_route53_record" "ooniapi_frontend_main" { - name = local.ooniapi_frontend_main_domain_name + name = local.ooniapi_frontend_main_domain_name zone_id = local.ooniapi_frontend_main_domain_name_zone_id type = "A" @@ -701,4 +711,4 @@ resource "aws_route53_record" "ooniapi_frontend_cert_validation" { resource "aws_acm_certificate_validation" "ooniapi_frontend" { certificate_arn = aws_acm_certificate.ooniapi_frontend.arn validation_record_fqdns = [for record in aws_route53_record.ooniapi_frontend_cert_validation : record.fqdn] -} \ No newline at end of file +} diff --git a/tf/environments/prod/.terraform.lock.hcl b/tf/environments/prod/.terraform.lock.hcl index 6f3c4ce4..d11e04d7 100644 --- a/tf/environments/prod/.terraform.lock.hcl +++ b/tf/environments/prod/.terraform.lock.hcl @@ -2,49 +2,49 @@ # Manual edits may be lost in future updates. 
provider "registry.terraform.io/digitalocean/digitalocean" { - version = "2.41.0" + version = "2.43.0" constraints = "~> 2.0" hashes = [ - "h1:Ne6nxvygwwHbNEO9My9uukE/YtlwAVMr/Bud1FIc6uc=", - "zh:13bfbca765a302a8fdf9ca0e4c5d25c7ee62d21b2bc7fbc241e298215c78e5f7", - "zh:45ef1602bb56fde0b6755f99847da0549144ebdd4af2da695e44d1a06d24d685", - "zh:4a6d81c462a11e710dd6138bb18573f60af456e83c5af0c1158578b4dc8e07f9", - "zh:5827b9463f7fce29bf4d9eb9264771d3aec103ed25e2151e570e8bee27b2dc6a", - "zh:639e59ffddb267a5255d66b93c816b713df96a304c23757364a96a65159ee177", - "zh:6876c162f2e4f850c4acede81857c72665710af2f552f19b1de56bcd5addc86a", - "zh:6a23b529309d6e8f59339d9572504e08f5c90491dfa0d1b1468a6fd7bd6b1b3d", - "zh:7d6e2c103f097a694b81d0e22ecd24ec2778a307e64dbef8de4f956d53219274", - "zh:8203577b5ad891e84afa994a47c6aba85401edf4bdd5aaf7f5e30e59e1393880", - "zh:88672feeae8ac9f4f99391b99957426c9c0a667021c658c4c9dad23abd5b5832", - "zh:ae3703123073a7808cea5a7a89289973e58a4fd83e94680091d4a8420ad521f5", - "zh:b59dd8675402e49a1fba5d2cf14596553c21f104bbb90a1167aa44c39693e7a5", - "zh:bb608cf1db63f985709e0052dbc3d16e9c801a23ebbf4d0a687c8a89d09e3769", - "zh:f1164e25518c00a640a8a375b2214d9bfc86297d2d726a6d35ed6d5de334ef96", - "zh:fc8a0a0375b26095e78ecfd987b79e6ef26c9c5d2e4393d437a9601ea1f3c5c5", - "zh:ffae2daa3ef366047885ace62f2fd0d126d6581d253996ef78c11bc5acbb3999", + "h1:NFD+iFS14S3EILq2ZJ8bHaQGetYEAnETqEjkhl52eiI=", + "zh:0023fa4ca4304e9141357df9dafff3bdb33f0189d0c8544f8b872070660ccb0e", + "zh:4004c3034197ca6a2d719d26125eb21e01e652dc77932e27fd0c60151d7ca6d1", + "zh:44173e57c086cad3177bb6c2063981fb9f4ac2d5f7fd9a9e1891b8c16a00d0d9", + "zh:4622261e108f8539102ce84894e03afcf9f70c796eee0ddced02c235a15d9460", + "zh:4fd86a35073061746c5b7dc693fb2a44793a15b49791edcbf0dbefef1d3dae0c", + "zh:5e00b0d847ce0f1e2f269ae55e1f9ea9ea76efb0f40af9ad43c61f89dd84a6d6", + "zh:815c30ce11020e18dd05462f22038764c4200c61a27313e67343dc66ebdcf12c", + "zh:901be1ee215935e0a459b9cb91699757e442355e5dd625637481e1d33cc0498c", + "zh:9bd04a076c175d2b90ab69cd03753e5e0ac3bab96ee6bfcaba83dcd29c829135", + "zh:9d03d25e7e30a2da6f6c2b7f46f6d21a33d55ee80209c21361b57baf7f3dd3f3", + "zh:b1f6ac1c4296e4e0e84b6955661058b04c812d72292d8f3af0b93327b59d0e6b", + "zh:c1cabafc7f1b836a56d62aa43b7d5b77faeb6d685490825f90b776c6852e9ffd", + "zh:ddfcf6ef57b99193f0dde25796cc8ad96a04dcb940eccd137e9a4d5f50c21d17", + "zh:e93dffb991e7ad7c8a0800bd6c7a692225f87656a8b73d7f0e8489a0635ea8ce", + "zh:f2137db6bd5a10662fe23c779c05d312eb71f6df5aa8d5f1e6a45b4c0404b2a0", + "zh:f5e494414b35293f830ffc741e4915744fa84400810dcbcb7df9920a4dadc56d", ] } provider "registry.terraform.io/hashicorp/aws" { - version = "5.44.0" + version = "5.75.0" constraints = ">= 4.9.0, >= 4.66.1" hashes = [ - "h1:K3sX+P4wofRNcVsnYW4PIhxHijd3w/ZD5AO7yWFPT6A=", - "zh:1224a42bb04574785549b89815d98bda11f6e9992352fc6c36c5622f3aea91c0", - "zh:2a8d1095a2f1ab097f516d9e7e0d289337849eebb3fcc34f075070c65063f4fa", - "zh:46cce11150eb4934196d9bff693b72d0494c85917ceb3c2914d5ff4a785af861", - "zh:4a7c15d585ee747d17f4b3904851cd95cfbb920fa197aed3df78e8d7ef9609b6", - "zh:508f1a85a0b0f93bf26341207d809bd55b60c8fdeede40097d91f30111fc6f5d", - "zh:52f968ffc21240213110378d0ffb298cbd23e9157a6d01dfac5a4360492d69c2", - "zh:5e9846b48ef03eb59541049e81b15cae8bc7696a3779ae4a5412fdce60bb24e0", - "zh:850398aecaf7dc0231fc320fdd6dffe41836e07a54c8c7b40eb28e7525d3c0a9", - "zh:8f87eeb05bdd1b873b6cfb3898dfad6402ac180dfa3c8f9754df8f85dcf92ca6", + "h1:1R08bG9RT1qWHU6K0B992s3VbTIdb7cWt421+TBVS/8=", + 
"zh:01b01b132b70df918f735898f1ad012ab3033d1b909b2e38950d16964d94c084", + "zh:28bc6ee7b0c88b1a48f315509ad390fb1e8f39bebe0f7a43c22b1a63825251d1", + "zh:31f9043a4c3538883ab9b9d3b399dae62e4552251e6a2b1da13ec3a2018a027d", + "zh:47451c295ffbddd19679a41d728f0942486d6de0d9206418d9593dda5a20c120", + "zh:5204c1a9f41dcc10e38879d41d95d95fdbb10527f613c129603137b1dbe99777", + "zh:64c3165a6019045782c8ad2a40d6fa4253d44dba67a5a971a81791cff5a9d3d5", "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", - "zh:c726b87cd6ed111536f875dccedecff21abc802a4087264515ffab113cac36dc", - "zh:d57ea706d2f98b93c7b05b0c6bc3420de8e8cf2d0b6703085dc15ed239b2cc49", - "zh:d5d1a21246e68c2a7a04c5619eb0ad5a81644f644c432cb690537b816a156de2", - "zh:e869904cac41114b7e4ee66bcd2ce4585ed15ca842040a60cb47119f69472c91", - "zh:f1a09f2f3ea72cbe795b865cf31ad9b1866a536a8050cf0bb93d3fa51069582e", + "zh:a5788f78da2f0ac78f99ca2a4c489c041654bec992f3183fd0b972e0554f91e9", + "zh:aed486e3b24e9f82543bf558b2a7eade4a905608060fac1284145c00ff63d3e2", + "zh:b42523c409940a9c3866f4973c8251b96e5f3a0934230849c533a04b95854965", + "zh:b570353eeb97b3ed1b423a6f67857a7a3c1c47c9907e45a81c3df186a2fd88d0", + "zh:bf05df84199cbc776a878f920f6be4d27737f2de204f80794e6a652d49692f0d", + "zh:c27133287d20620244de95f4c2438135e60c057e0891a3ec97539c990f7ebdec", + "zh:c59143082fe8e4f5d5b0676472b8b0e24c2a2f1ede622a64f9f24639382d4b03", + "zh:ebe01c3b7a85deebc10b4081097dd6e8b4c79b7c13a20acb099bd17ff06afcb7", ] } @@ -68,98 +68,98 @@ provider "registry.terraform.io/hashicorp/cloudinit" { } provider "registry.terraform.io/hashicorp/local" { - version = "2.5.1" + version = "2.5.2" constraints = ">= 2.0.0" hashes = [ - "h1:/GAVA/xheGQcbOZEq0qxANOg+KVLCA7Wv8qluxhTjhU=", - "zh:0af29ce2b7b5712319bf6424cb58d13b852bf9a777011a545fac99c7fdcdf561", - "zh:126063ea0d79dad1f68fa4e4d556793c0108ce278034f101d1dbbb2463924561", - "zh:196bfb49086f22fd4db46033e01655b0e5e036a5582d250412cc690fa7995de5", - "zh:37c92ec084d059d37d6cffdb683ccf68e3a5f8d2eb69dd73c8e43ad003ef8d24", - "zh:4269f01a98513651ad66763c16b268f4c2da76cc892ccfd54b401fff6cc11667", - "zh:51904350b9c728f963eef0c28f1d43e73d010333133eb7f30999a8fb6a0cc3d8", - "zh:73a66611359b83d0c3fcba2984610273f7954002febb8a57242bbb86d967b635", + "h1:IyFbOIO6mhikFNL/2h1iZJ6kyN3U00jgkpCLUCThAfE=", + "zh:136299545178ce281c56f36965bf91c35407c11897f7082b3b983d86cb79b511", + "zh:3b4486858aa9cb8163378722b642c57c529b6c64bfbfc9461d940a84cd66ebea", + "zh:4855ee628ead847741aa4f4fc9bed50cfdbf197f2912775dd9fe7bc43fa077c0", + "zh:4b8cd2583d1edcac4011caafe8afb7a95e8110a607a1d5fb87d921178074a69b", + "zh:52084ddaff8c8cd3f9e7bcb7ce4dc1eab00602912c96da43c29b4762dc376038", + "zh:71562d330d3f92d79b2952ffdda0dad167e952e46200c767dd30c6af8d7c0ed3", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:7ae387993a92bcc379063229b3cce8af7eaf082dd9306598fcd42352994d2de0", - "zh:9e0f365f807b088646db6e4a8d4b188129d9ebdbcf2568c8ab33bddd1b82c867", - "zh:b5263acbd8ae51c9cbffa79743fbcadcb7908057c87eb22fd9048268056efbc4", - "zh:dfcd88ac5f13c0d04e24be00b686d069b4879cc4add1b7b1a8ae545783d97520", + "zh:805f81ade06ff68fa8b908d31892eaed5c180ae031c77ad35f82cb7a74b97cf4", + "zh:8b6b3ebeaaa8e38dd04e56996abe80db9be6f4c1df75ac3cccc77642899bd464", + "zh:ad07750576b99248037b897de71113cc19b1a8d0bc235eb99173cc83d0de3b1b", + "zh:b9f1c3bfadb74068f5c205292badb0661e17ac05eb23bfe8bd809691e4583d0e", + "zh:cc4cbcd67414fefb111c1bf7ab0bc4beb8c0b553d01719ad17de9a047adff4d1", ] } provider "registry.terraform.io/hashicorp/null" { - version = "3.2.2" + version = "3.2.3" hashes = [ - 
"h1:IMVAUHKoydFrlPrl9OzasDnw/8ntZFerCC9iXw1rXQY=", - "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", - "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", - "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", - "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", - "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", - "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", - "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", + "h1:I0Um8UkrMUb81Fxq/dxbr3HLP2cecTH2WMJiwKSrwQY=", + "zh:22d062e5278d872fe7aed834f5577ba0a5afe34a3bdac2b81f828d8d3e6706d2", + "zh:23dead00493ad863729495dc212fd6c29b8293e707b055ce5ba21ee453ce552d", + "zh:28299accf21763ca1ca144d8f660688d7c2ad0b105b7202554ca60b02a3856d3", + "zh:55c9e8a9ac25a7652df8c51a8a9a422bd67d784061b1de2dc9fe6c3cb4e77f2f", + "zh:756586535d11698a216291c06b9ed8a5cc6a4ec43eee1ee09ecd5c6a9e297ac1", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", - "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", - "zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", - "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", + "zh:9d5eea62fdb587eeb96a8c4d782459f4e6b73baeece4d04b4a40e44faaee9301", + "zh:a6355f596a3fb8fc85c2fb054ab14e722991533f87f928e7169a486462c74670", + "zh:b5a65a789cff4ada58a5baffc76cb9767dc26ec6b45c00d2ec8b1b027f6db4ed", + "zh:db5ab669cf11d0e9f81dc380a6fdfcac437aea3d69109c7aef1a5426639d2d65", + "zh:de655d251c470197bcbb5ac45d289595295acb8f829f6c781d4a75c8c8b7c7dd", + "zh:f5c68199f2e6076bce92a12230434782bf768103a427e9bb9abee99b116af7b5", ] } provider "registry.terraform.io/hashicorp/random" { - version = "3.6.0" + version = "3.6.3" hashes = [ - "h1:I8MBeauYA8J8yheLJ8oSMWqB0kovn16dF/wKZ1QTdkk=", - "zh:03360ed3ecd31e8c5dac9c95fe0858be50f3e9a0d0c654b5e504109c2159287d", - "zh:1c67ac51254ba2a2bb53a25e8ae7e4d076103483f55f39b426ec55e47d1fe211", - "zh:24a17bba7f6d679538ff51b3a2f378cedadede97af8a1db7dad4fd8d6d50f829", - "zh:30ffb297ffd1633175d6545d37c2217e2cef9545a6e03946e514c59c0859b77d", - "zh:454ce4b3dbc73e6775f2f6605d45cee6e16c3872a2e66a2c97993d6e5cbd7055", + "h1:zG9uFP8l9u+yGZZvi5Te7PV62j50azpgwPunq2vTm1E=", + "zh:04ceb65210251339f07cd4611885d242cd4d0c7306e86dda9785396807c00451", + "zh:448f56199f3e99ff75d5c0afacae867ee795e4dfda6cb5f8e3b2a72ec3583dd8", + "zh:4b4c11ccfba7319e901df2dac836b1ae8f12185e37249e8d870ee10bb87a13fe", + "zh:4fa45c44c0de582c2edb8a2e054f55124520c16a39b2dfc0355929063b6395b1", + "zh:588508280501a06259e023b0695f6a18149a3816d259655c424d068982cbdd36", + "zh:737c4d99a87d2a4d1ac0a54a73d2cb62974ccb2edbd234f333abd079a32ebc9e", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:91df0a9fab329aff2ff4cf26797592eb7a3a90b4a0c04d64ce186654e0cc6e17", - "zh:aa57384b85622a9f7bfb5d4512ca88e61f22a9cea9f30febaa4c98c68ff0dc21", - "zh:c4a3e329ba786ffb6f2b694e1fd41d413a7010f3a53c20b432325a94fa71e839", - "zh:e2699bc9116447f96c53d55f2a00570f982e6f9935038c3810603572693712d0", - "zh:e747c0fd5d7684e5bfad8aa0ca441903f15ae7a98a737ff6aca24ba223207e2c", - "zh:f1ca75f417ce490368f047b63ec09fd003711ae48487fba90b4aba2ccf71920e", + "zh:a357ab512e5ebc6d1fda1382503109766e21bbfdfaa9ccda43d313c122069b30", + "zh:c51bfb15e7d52cc1a2eaec2a903ac2aff15d162c172b1b4c17675190e8147615", + "zh:e0951ee6fa9df90433728b96381fb867e3db98f66f735e0c3e24f8f16903f0ad", + 
"zh:e3cdcb4e73740621dabd82ee6a37d6cfce7fee2a03d8074df65086760f5cf556", + "zh:eff58323099f1bd9a0bec7cb04f717e7f1b2774c7d612bf7581797e1622613a0", ] } provider "registry.terraform.io/hashicorp/time" { - version = "0.11.1" + version = "0.12.1" constraints = ">= 0.7.1" hashes = [ - "h1:pQGSL9mdgw4qsLndFYsEF93mbsIxyxNoAyIbBqhS3Xo=", - "zh:19a393db736ec4fd024d098d55aefaef07056c37a448ece3b55b3f5f4c2c7e4a", - "zh:227fa1e221de2907f37be78d40c06ca6a6f7b243a1ec33ade014dfaf6d92cd9c", - "zh:29970fecbf4a3ca23bacbb05d6b90cdd33dd379f90059fe39e08289951502d9f", - "zh:65024596f22f10e7dcb5e0e4a75277f275b529daa0bc0daf34ca7901c678ab88", - "zh:694d080cb5e3bf5ef08c7409208d061c135a4f5f4cdc93ea8607860995264b2e", + "h1:JzYsPugN8Fb7C4NlfLoFu7BBPuRVT2/fCOdCaxshveI=", + "zh:090023137df8effe8804e81c65f636dadf8f9d35b79c3afff282d39367ba44b2", + "zh:26f1e458358ba55f6558613f1427dcfa6ae2be5119b722d0b3adb27cd001efea", + "zh:272ccc73a03384b72b964918c7afeb22c2e6be22460d92b150aaf28f29a7d511", + "zh:438b8c74f5ed62fe921bd1078abe628a6675e44912933100ea4fa26863e340e9", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:b29d15d13e1b3412e6a4e1627d378dbd102659132f7488f64017dd6b6d5216d3", - "zh:bb79f4cae9f8c17c73998edc54aa16c2130a03227f7f4e71fc6ac87e230575ec", - "zh:ceccf80e95929d97f62dcf1bb3c7c7553d5757b2d9e7d222518722fc934f7ad5", - "zh:f40e638336527490e294d9c938ae55919069e6987e85a80506784ba90348792a", - "zh:f99ef33b1629a3b2278201142a3011a8489e66d92da832a5b99e442204de18fb", - "zh:fded14754ea46fdecc62a52cd970126420d4cd190e598cb61190b4724a727edb", + "zh:85c8bd8eefc4afc33445de2ee7fbf33a7807bc34eb3734b8eefa4e98e4cddf38", + "zh:98bbe309c9ff5b2352de6a047e0ec6c7e3764b4ed3dfd370839c4be2fbfff869", + "zh:9c7bf8c56da1b124e0e2f3210a1915e778bab2be924481af684695b52672891e", + "zh:d2200f7f6ab8ecb8373cda796b864ad4867f5c255cff9d3b032f666e4c78f625", + "zh:d8c7926feaddfdc08d5ebb41b03445166df8c125417b28d64712dccd9feef136", + "zh:e2412a192fc340c61b373d6c20c9d805d7d3dee6c720c34db23c2a8ff0abd71b", + "zh:e6ac6bba391afe728a099df344dbd6481425b06d61697522017b8f7a59957d44", ] } provider "registry.terraform.io/hashicorp/tls" { - version = "4.0.5" + version = "4.0.6" hashes = [ - "h1:zeG5RmggBZW/8JWIVrdaeSJa0OG62uFX5HY1eE8SjzY=", - "zh:01cfb11cb74654c003f6d4e32bbef8f5969ee2856394a96d127da4949c65153e", - "zh:0472ea1574026aa1e8ca82bb6df2c40cd0478e9336b7a8a64e652119a2fa4f32", - "zh:1a8ddba2b1550c5d02003ea5d6cdda2eef6870ece86c5619f33edd699c9dc14b", - "zh:1e3bb505c000adb12cdf60af5b08f0ed68bc3955b0d4d4a126db5ca4d429eb4a", - "zh:6636401b2463c25e03e68a6b786acf91a311c78444b1dc4f97c539f9f78de22a", - "zh:76858f9d8b460e7b2a338c477671d07286b0d287fd2d2e3214030ae8f61dd56e", - "zh:a13b69fb43cb8746793b3069c4d897bb18f454290b496f19d03c3387d1c9a2dc", - "zh:a90ca81bb9bb509063b736842250ecff0f886a91baae8de65c8430168001dad9", - "zh:c4de401395936e41234f1956ebadbd2ed9f414e6908f27d578614aaa529870d4", - "zh:c657e121af8fde19964482997f0de2d5173217274f6997e16389e7707ed8ece8", - "zh:d68b07a67fbd604c38ec9733069fbf23441436fecf554de6c75c032f82e1ef19", + "h1:n3M50qfWfRSpQV9Pwcvuse03pEizqrmYEryxKky4so4=", + "zh:10de0d8af02f2e578101688fd334da3849f56ea91b0d9bd5b1f7a243417fdda8", + "zh:37fc01f8b2bc9d5b055dc3e78bfd1beb7c42cfb776a4c81106e19c8911366297", + "zh:4578ca03d1dd0b7f572d96bd03f744be24c726bfd282173d54b100fd221608bb", + "zh:6c475491d1250050765a91a493ef330adc24689e8837a0f07da5a0e1269e11c1", + "zh:81bde94d53cdababa5b376bbc6947668be4c45ab655de7aa2e8e4736dfd52509", + "zh:abdce260840b7b050c4e401d4f75c7a199fafe58a8b213947a258f75ac18b3e8", + 
"zh:b754cebfc5184873840f16a642a7c9ef78c34dc246a8ae29e056c79939963c7a", + "zh:c928b66086078f9917aef0eec15982f2e337914c5c4dbc31dd4741403db7eb18", + "zh:cded27bee5f24de6f2ee0cfd1df46a7f88e84aaffc2ecbf3ff7094160f193d50", + "zh:d65eb3867e8f69aaf1b8bb53bd637c99c6b649ba3db16ded50fa9a01076d1a27", + "zh:ecb0c8b528c7a619fa71852bb3fb5c151d47576c5aab2bf3af4db52588722eeb", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", ] } diff --git a/tf/environments/prod/dns_records.tf b/tf/environments/prod/dns_records.tf index c2d680a7..61852e28 100644 --- a/tf/environments/prod/dns_records.tf +++ b/tf/environments/prod/dns_records.tf @@ -862,14 +862,6 @@ resource "aws_route53_record" "test-qemu-infra-ooni-io-_A_" { zone_id = local.dns_root_zone_ooni_io } -resource "aws_route53_record" "wcth-ooni-io-_A_" { - name = "wcth.ooni.io" - records = ["37.218.245.117"] - ttl = "60" - type = "A" - zone_id = local.dns_root_zone_ooni_io -} - resource "aws_route53_record" "www-ooni-io-_CNAME_" { name = "www.ooni.io" records = ["ooni.netlify.com"] @@ -886,30 +878,6 @@ resource "aws_route53_record" "ams-ps-ooni-nu-_A_" { zone_id = local.dns_root_zone_ooni_nu } -resource "aws_route53_record" "ams-wcth-ooni-nu-_A_" { - name = "ams-wcth.ooni.nu" - records = ["37.218.245.114"] - ttl = "300" - type = "A" - zone_id = local.dns_root_zone_ooni_nu -} - -resource "aws_route53_record" "ams-wcth2-ooni-nu-_A_" { - name = "ams-wcth2.ooni.nu" - records = ["37.218.247.47"] - ttl = "300" - type = "A" - zone_id = local.dns_root_zone_ooni_nu -} - -resource "aws_route53_record" "ams-wcth3-ooni-nu-_A_" { - name = "ams-wcth3.ooni.nu" - records = ["37.218.245.117"] - ttl = "300" - type = "A" - zone_id = local.dns_root_zone_ooni_nu -} - resource "aws_route53_record" "amsmatomo-ooni-nu-_A_" { name = "amsmatomo.ooni.nu" records = ["37.218.242.173"] @@ -1005,3 +973,51 @@ resource "aws_route53_record" "notebook-ooni-org-_A_" { type = "A" zone_id = local.dns_root_zone_ooni_org } + +resource "aws_route53_record" "data1-htz-fsn-prod-ooni-nu-_a_" { + name = "data1.htz-fsn.prod.ooni.nu" + records = ["142.132.254.225"] + ttl = "60" + type = "A" + zone_id = local.dns_zone_ooni_nu +} + +resource "aws_route53_record" "data3-htz-fsn-prod-ooni-nu-_A_" { + name = "data3.htz-fsn.prod.ooni.nu" + records = ["168.119.7.188"] + ttl = "60" + type = "A" + zone_id = local.dns_zone_ooni_nu +} + +resource "aws_route53_record" "clickhouse1-prod-ooni-io-_a_" { + name = "clickhouse1.prod.ooni.io" + records = ["142.132.254.225"] + ttl = "60" + type = "A" + zone_id = local.dns_zone_ooni_io +} + +resource "aws_route53_record" "clickhouse2-prod-ooni-io-_A_" { + name = "clickhouse2.prod.ooni.io" + records = ["88.198.54.12"] + ttl = "60" + type = "A" + zone_id = local.dns_zone_ooni_io +} + +resource "aws_route53_record" "clickhouse3-prod-ooni-io-_A_" { + name = "clickhouse3.prod.ooni.io" + records = ["168.119.7.188"] + ttl = "60" + type = "A" + zone_id = local.dns_zone_ooni_io +} + +resource "aws_route53_record" "airflow-prod-ooni-io-_a_" { + name = "airflow.prod.ooni.io" + records = ["142.132.254.225"] + ttl = "60" + type = "A" + zone_id = local.dns_zone_ooni_io +} diff --git a/tf/environments/prod/main.tf b/tf/environments/prod/main.tf index 70e0ae5d..d9152ae5 100644 --- a/tf/environments/prod/main.tf +++ b/tf/environments/prod/main.tf @@ -42,6 +42,18 @@ provider "aws" { data "aws_availability_zones" "available" {} +data "aws_secretsmanager_secret" "do_token" { + name = "oonidevops/digitalocean_access_token" +} + +data "aws_secretsmanager_secret_version" 
"do_token_version" { + secret_id = data.aws_secretsmanager_secret.do_token.id +} + +provider "digitalocean" { + token = data.aws_secretsmanager_secret_version.do_token_version.secret_string +} + ### !!! IMPORTANT !!! # The first time you run terraform for a new environment you have to setup the # required roles in AWS. @@ -74,7 +86,9 @@ module "adm_iam_roles" { authorized_accounts = [ "arn:aws:iam::${local.ooni_main_org_id}:user/art", - "arn:aws:iam::${local.ooni_main_org_id}:user/mehul" + "arn:aws:iam::${local.ooni_main_org_id}:user/luis", + "arn:aws:iam::${local.ooni_main_org_id}:user/mehul", + "arn:aws:iam::${local.ooni_main_org_id}:user/tony" ] } @@ -147,8 +161,8 @@ module "oonipg" { vpc_id = module.network.vpc_id subnet_ids = module.network.vpc_subnet_public[*].id db_instance_class = "db.t3.micro" - db_storage_type = "standard" - db_allocated_storage = "5" + db_storage_type = "gp3" + db_allocated_storage = "50" db_max_allocated_storage = null tags = merge( local.tags, @@ -275,6 +289,7 @@ module "ooni_th_droplet" { "3d:81:99:17:b5:d1:20:a5:fe:2b:14:96:67:93:d6:34", "f6:4b:8b:e2:0e:d2:97:c5:45:5c:07:a6:fe:54:60:0e" ] + dns_zone_ooni_io = local.dns_zone_ooni_io } @@ -303,6 +318,58 @@ module "ooni_backendproxy" { ) } +module "ooniapi_reverseproxy_deployer" { + source = "../../modules/ooniapi_service_deployer" + + service_name = "reverseproxy" + repo = "ooni/backend" + branch_name = "master" + buildspec_path = "ooniapi/services/reverseproxy/buildspec.yml" + codestar_connection_arn = aws_codestarconnections_connection.oonidevops.arn + + codepipeline_bucket = aws_s3_bucket.ooniapi_codepipeline_bucket.bucket + + ecs_service_name = module.ooniapi_reverseproxy.ecs_service_name + ecs_cluster_name = module.ooniapi_cluster.cluster_name +} + +module "ooniapi_reverseproxy" { + source = "../../modules/ooniapi_service" + + task_memory = 64 + + # First run should be set on first run to bootstrap the task definition + # first_run = true + + vpc_id = module.network.vpc_id + public_subnet_ids = module.network.vpc_subnet_public[*].id + private_subnet_ids = module.network.vpc_subnet_private[*].id + + service_name = "reverseproxy" + default_docker_image_url = "ooni/api-reverseproxy:latest" + stage = local.environment + dns_zone_ooni_io = local.dns_zone_ooni_io + key_name = module.adm_iam_roles.oonidevops_key_name + ecs_cluster_id = module.ooniapi_cluster.cluster_id + + task_secrets = { + PROMETHEUS_METRICS_PASSWORD = aws_secretsmanager_secret_version.prometheus_metrics_password.arn + } + + task_environment = { + TARGET_URL = "https://backend-fsn.ooni.org/" + } + + ooniapi_service_security_groups = [ + module.ooniapi_cluster.web_security_group_id + ] + + tags = merge( + local.tags, + { Name = "ooni-tier0-reverseproxy" } + ) +} + ### OONI Services Clusters module "ooniapi_cluster" { @@ -314,11 +381,11 @@ module "ooniapi_cluster" { subnet_ids = module.network.vpc_subnet_public[*].id # You need be careful how these are tweaked. 
@@ -451,7 +518,7 @@ module "ooniapi_oonifindings_deployer" {
 
 module "ooniapi_oonifindings" {
   source = "../../modules/ooniapi_service"
-  first_run = true
+  # first_run = true
   vpc_id = module.network.vpc_id
   public_subnet_ids = module.network.vpc_subnet_public[*].id
   private_subnet_ids = module.network.vpc_subnet_private[*].id
@@ -499,7 +566,7 @@ module "ooniapi_ooniauth_deployer" {
 
 module "ooniapi_ooniauth" {
   source = "../../modules/ooniapi_service"
-  #first_run = true
+  # first_run = true
 
   vpc_id = module.network.vpc_id
   private_subnet_ids = module.network.vpc_subnet_private[*].id
@@ -557,7 +624,7 @@ module "ooniapi_frontend" {
   vpc_id = module.network.vpc_id
   subnet_ids = module.network.vpc_subnet_public[*].id
 
-  oonibackend_proxy_target_group_arn = module.ooni_backendproxy.alb_target_group_id
+  oonibackend_proxy_target_group_arn = module.ooniapi_reverseproxy.alb_target_group_id
   ooniapi_oonirun_target_group_arn = module.ooniapi_oonirun.alb_target_group_id
   ooniapi_ooniauth_target_group_arn = module.ooniapi_ooniauth.alb_target_group_id
   ooniapi_ooniprobe_target_group_arn = module.ooniapi_ooniprobe.alb_target_group_id
@@ -598,6 +665,7 @@ locals {
     "ooniauth.${local.environment}.ooni.io" : local.dns_zone_ooni_io,
     "ooniprobe.${local.environment}.ooni.io" : local.dns_zone_ooni_io,
     "oonirun.${local.environment}.ooni.io" : local.dns_zone_ooni_io,
+    "oonifindings.${local.environment}.ooni.io" : local.dns_zone_ooni_io,
   }
   ooniapi_frontend_main_domain_name = "api.${local.environment}.ooni.io"
   ooniapi_frontend_main_domain_name_zone_id = local.dns_zone_ooni_io
diff --git a/tf/environments/prod/versions.tf b/tf/environments/prod/versions.tf
index 682191e7..3c3ed712 100644
--- a/tf/environments/prod/versions.tf
+++ b/tf/environments/prod/versions.tf
@@ -6,5 +6,9 @@ terraform {
       source = "hashicorp/aws"
       version = ">= 4.66.1"
     }
+    digitalocean = {
+      source = "digitalocean/digitalocean"
+      version = "~> 2.0"
+    }
   }
 }
diff --git a/tf/modules/ooni_backendproxy/main.tf b/tf/modules/ooni_backendproxy/main.tf
index ad5b9bec..110461d3 100644
--- a/tf/modules/ooni_backendproxy/main.tf
+++ b/tf/modules/ooni_backendproxy/main.tf
@@ -12,16 +12,16 @@ resource "aws_security_group" "nginx_sg" {
 
   ingress {
     protocol = "tcp"
-    from_port = 80
-    to_port = 80
-    cidr_blocks = ["0.0.0.0/0"]
+    from_port = 9000
+    to_port = 9000
+    cidr_blocks = var.private_subnet_cidr
   }
 
   ingress {
     protocol = "tcp"
-    from_port = 9000
-    to_port = 9000
-    cidr_blocks = var.private_subnet_cidr
+    from_port = 80
+    to_port = 80
+    cidr_blocks = ["0.0.0.0/0"]
   }
 
   ingress {
@@ -132,7 +132,7 @@ resource "aws_lb_target_group_attachment" "oonibackend_proxy" {
 
 resource "aws_route53_record" "clickhouse_proxy_alias" {
   zone_id = var.dns_zone_ooni_io
-  name = "clickhouse.${var.stage}.ooni.io"
+  name = "clickhouseproxy.${var.stage}.ooni.io"
   type = "CNAME"
   ttl = 300
diff --git a/tf/modules/ooni_th_droplet/main.tf b/tf/modules/ooni_th_droplet/main.tf
index b62b47e9..9836ac62 100644
--- a/tf/modules/ooni_th_droplet/main.tf
+++ b/tf/modules/ooni_th_droplet/main.tf
@@ -34,6 +34,7 @@ resource "digitalocean_droplet" "ooni_th_docker" {
 
   lifecycle {
     create_before_destroy = true
+    ignore_changes = all
   }
 }
 resource "aws_route53_record" "ooni_th" {
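Note: `ignore_changes = all` makes the droplet effectively immutable from terraform's point of view: once created, later edits to its arguments (size, image, `user_data`, SSH keys) no longer produce a plan diff, so the cloud-init changes below only take effect on newly created droplets. Rotating an existing test helper therefore requires forcing a replacement, e.g. with `terraform apply -replace=ADDRESS` on recent terraform versions (`terraform taint` on older ones).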
diff --git a/tf/modules/ooni_th_droplet/templates/cloud-init-docker.yml b/tf/modules/ooni_th_droplet/templates/cloud-init-docker.yml
index 4f82bcc6..93135daf 100644
--- a/tf/modules/ooni_th_droplet/templates/cloud-init-docker.yml
+++ b/tf/modules/ooni_th_droplet/templates/cloud-init-docker.yml
@@ -103,7 +103,9 @@ write_files:
     content: |
       {
         "ipv6": true,
-        "fixed-cidr-v6": "2001:db8:1::/64"
+        "fixed-cidr-v6": "2001:db8:1::/64",
+        "log-driver": "json-file",
+        "log-opts": {"max-size": "100m", "max-file": "3"}
       }
 
   - path: /etc/nginx/sites-available/default
diff --git a/tf/modules/ooniapi_frontend/main.tf b/tf/modules/ooniapi_frontend/main.tf
index dd6e428b..be321362 100644
--- a/tf/modules/ooniapi_frontend/main.tf
+++ b/tf/modules/ooniapi_frontend/main.tf
@@ -182,7 +182,12 @@ resource "aws_lb_listener_rule" "ooniapi_oonifindings_rule" {
 
   condition {
     path_pattern {
-      values = ["/api/v1/incidents/*"]
+      values = [
+        "/api/v1/incidents/*",
+        "/api/v1/aggregation/*",
+        "/api/v1/observations",
+        "/api/v1/analysis",
+      ]
     }
   }
 }
diff --git a/tf/modules/ooniapi_service/main.tf b/tf/modules/ooniapi_service/main.tf
index ad429a01..c5def884 100644
--- a/tf/modules/ooniapi_service/main.tf
+++ b/tf/modules/ooniapi_service/main.tf
@@ -40,11 +40,6 @@ resource "aws_cloudwatch_log_group" "ooniapi_service" {
   name = "ooni-ecs-group/${local.name}"
 }
 
-
-locals {
-  container_port = 80
-}
-
 // This is done to retrieve the image name of the current task definition
 // It's important to keep the container_name and task_definitions aligned
 data "aws_ecs_container_definition" "ooniapi_service_current" {
@@ -59,18 +54,17 @@ resource "aws_ecs_task_definition" "ooniapi_service" {
 
   container_definitions = jsonencode([
     {
-      cpu = var.task_cpu,
+      memoryReservation = var.task_memory,
       essential = true,
       image = try(
         data.aws_ecs_container_definition.ooniapi_service_current[0].image,
         var.default_docker_image_url
       ),
-      memory = var.task_memory,
       name = local.name,
       portMappings = [
         {
-          containerPort = local.container_port,
+          containerPort = 80
         }
       ],
diff --git a/tf/modules/ooniapi_service/templates/profile_policy.json b/tf/modules/ooniapi_service/templates/profile_policy.json
index 5857ee55..3a772893 100644
--- a/tf/modules/ooniapi_service/templates/profile_policy.json
+++ b/tf/modules/ooniapi_service/templates/profile_policy.json
@@ -35,6 +35,16 @@
       "Action": "secretsmanager:ListSecrets",
       "Resource": "*"
     },
+    {
+      "Effect": "Allow",
+      "Action": [
+        "ssm:GetParameter",
+        "ssm:GetParameters",
+        "ssm:GetParameterHistory",
+        "ssm:GetParametersByPath"
+      ],
+      "Resource": "arn:aws:ssm:*"
+    },
     {
       "Effect": "Allow",
       "Action": [
diff --git a/tf/modules/ooniapi_service/variables.tf b/tf/modules/ooniapi_service/variables.tf
index f83e16d7..bda90a72 100644
--- a/tf/modules/ooniapi_service/variables.tf
+++ b/tf/modules/ooniapi_service/variables.tf
@@ -44,13 +44,8 @@ variable "service_desired_count" {
   default = 1
 }
 
-variable "task_cpu" {
-  default = 256
-  description = "https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_definition_parameters.html#task_size"
-}
-
 variable "task_memory" {
-  default = 512
+  default = 64
   description = "https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_definition_parameters.html#task_size"
 }
 
@@ -79,4 +74,4 @@ variable "task_environment" {
 variable "ooniapi_service_security_groups" {
   description = "the shared web security group from the ecs cluster"
   type = list(string)
-}
+}
\ No newline at end of file
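Note: dropping `task_cpu` and the hard `memory` limit in favor of `memoryReservation` changes how ECS schedules these containers: `memory` is a hard ceiling enforced by the OOM killer, while `memoryReservation` only reserves capacity at placement time and lets the container burst above it when the instance has free memory. A minimal sketch of the resulting container definition shape (name and image are illustrative, not the module's actual values):

```hcl
container_definitions = jsonencode([
  {
    # illustrative values; the module uses local.name and the current/default image
    name              = "example-service"
    image             = "ooni/api-example:latest"
    essential         = true
    memoryReservation = 64 # soft limit (MiB): reserved for scheduling, burstable above
    # memory = 512         # hard limit (MiB): container is OOM-killed above it; now omitted
    portMappings = [
      { containerPort = 80 }
    ]
  }
])
```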