-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathmuchos.props.example
344 lines (331 loc) · 16.3 KB
/
muchos.props.example
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[general]
# Cluster type (azure, ec2, or existing)
cluster_type = ec2
# Cluster user name (install command will SSH to cluster using this user)
# Leave default below if launching cluster in AWS
cluster_user = fedora
# Cluster user group
cluster_group = %(cluster_user)s
# Cluster user home directory
user_home = /home/%(cluster_user)s
# Install directory where Hadoop, Accumulo, etc will be installed
install_dir = %(user_home)s/install
# Hostname of proxy node that Muchos will use to direct installation of cluster. Will be given
# public IP if launching in EC2. If not launching in EC2, node must have public IP that can be reached
# from your machine. Hostname can be chosen from "nodes" section below.
proxy_hostname = leader1
# If set, a SOCKS proxy will be created on the specified port when connecting to proxy using 'muchos ssh <cluster>'
proxy_socks_port = 38585
# Accumulo Instance name
accumulo_instance = muchos
# Accumulo Password
accumulo_password = secret
# Software versions (make sure you have a corresponding entry for the checksum in conf/checksums)
hadoop_version = 3.3.6
zookeeper_version = 3.8.4
spark_version = 3.4.4
fluo_version = 1.2.0
fluo_yarn_version = 1.0.0
accumulo_version = 2.1.3
# Specifies if software should be downloaded. If 'False', tarballs of the software above should be in conf/upload/
download_software = True
# Install Hub (for GitHub)
install_hub = True
# The java package to install - by default OpenJDK 11. If you need to use 1.8, please change this.
java_package=java-11-openjdk-devel
# The package to use for java 1.8
# java_package=java-1.8.0-openjdk-devel
# Please read the High-Availability section of the README before switching to 'True'
hdfs_ha = False
# Give a logical name for the cluster, all one word no special characters. Required to support HDFS HA.
nameservice_id = muchoshacluster
# Number of Accumulo tablet servers to run per host. This is only effective when use_systemd is set to 'True'.
# The accumulo-service script needs changes in order to support non-systemd cases.
num_tservers = 1
# If accumulo services are to be run under systemd, set this to 'True'
use_systemd = False
# ELK stack
elasticsearch_version = 7.10.2
kibana_version = 7.10.2
filebeat_version = 7.10.2
logstash_version = 7.10.2
[ec2]
# AWS machine image to use. The default below is for a Fedora image (in us-east-1).
# You may need to change this value if a new image has been released or you are running in a different region.
aws_ami = ami-08b4ee602f76bff79
# Type of AWS instance launched by default
default_instance_type = m5d.large
# Type of AWS instance launched for any node running 'worker' service
# Leave default below to use same instance type set by 'default_instance_type' property
worker_instance_type = %(default_instance_type)s
# Enable template mode by selecting a template from conf/templates, in order to leverage your own
# custom EC2 launch requests (optional). See conf/templates/README.md for more information
#cluster_template = example
# VPC to launch instances in (optional)
#vpc_id = vpc-xxxxx
# VPC Subnet to launch instances in (optional)
#subnet_id = subnet-xxxxxx
# Security group ID to launch in (optional)
#security_group_id = sg-xxxxxx
# Name of public key that will be loaded by Amazon on to your EC2 instances.
# You can upload and name your public key using the EC2 Management Console.
# Only the user with this key will be able to SSH to the cluster.
# Name below should be your 'Key pair name' in EC2 and not name of your public key file.
key_name = my_aws_key
# Type of filesystem to format instance storage as.
fstype = ext4
# Force formatting of instance devices, even when they have an existing filesystem.
force_format = no
# Tags to add to instances
#instance_tags = key1:value1,key2:value2
# Nodes will be given public IP addresses if true
associate_public_ip = true
# Path to file containing user data that will be executed at launch
#user_data_path = /path/to/user_data
# Shutdown instances after a delay (in minutes). If 0, no shutdown will occur.
shutdown_delay_minutes = 0
# Shutdown behavior of EC2 instances: terminate or stop
shutdown_behavior = stop
[azure]
# Azure subscription ID
azure_subscription_id =
# Name of the Azure resource group to use. All resources are created within this
# resource group. If a resource group with this name already exists, it will be
# used, otherwise a new resource group with this name will be created and used.
resource_group = accumulo-rg
# Name of the Azure Virtual Network (VNET) to use. If a VNET with this name
# already exists, it will be used; else a new VNET with this name will be created.
vnet = vnet1
# The CIDR prefix used to create the virtual network (VNET).
vnet_cidr = 10.0.0.0/8
# A single subnet is created within the VNET and given the following name.
subnet = subnet1
# The CIDR prefix used for the single subnet within the virtual network.
subnet_cidr = 10.1.0.0/16
# Optional. If set to True, multiple VMSS will be created based on conf/azure_multiple_vmss_vars.yml
use_multiple_vmss = False
# The below "vmss_priority" controls whether Azure Spot is used (or not).
# When vmss_priority is set to Spot, the cluster uses Azure Spot VM Scale Sets.
# When vmss_priority is set to None (which is the default), Azure Spot is NOT used.
# Note that for multiple-VMSS deployments, this setting can be overridden at a
# per-VMSS level by specifying vmss_priority in the conf/azure_multiple_vmss_vars.yml file.
vmss_priority = None
# Azure image reference defined as a pipe-delimited string in the format offer|publisher|sku|version|image_id|
# Please refer to the 'Launching an Azure cluster' section of the README before making changes
azure_image_reference = almalinux-x86_64|almalinux|9-gen2|latest||
# Image payment plan information - values required only if the image requires payment plan info.
# The format of this configuration line is plan_name|product|publisher|
azure_image_plan = |||
# Cloud init file to use when creating VMs with the above image reference.
# Currently, an Alma Linux 9 specific cloud-init file is used. Different files can be used if needed.
azure_image_cloud_init_file = cloud-init-alma9.yml
# Azure image reference defined as a pipe-delimited string in the format offer|publisher|sku|version|image_id|
# This is the image that will be used for the proxy machine (if specified by azure_proxy_host). If
# this is not set, then the value of azure_image_reference will be used on the proxy.
# Please refer to the 'Launching an Azure cluster' section of the README before making changes
#azure_proxy_image_reference = almalinux-x86_64|almalinux|9-gen2|latest||
# Proxy image payment plan information - required only if the proxy image requires payment plan info.
# If this is not set, the value of azure_image_plan will be used on the proxy.
# The format of this configuration line is plan_name|product|publisher|
# azure_proxy_image_plan = |||
# Optional cloud init file to use when creating VMs with the above proxy image reference. If this is not set,
# the value of azure_image_cloud_init_file will be used.
# azure_proxy_image_cloud_init_file =
# Optional OS disk size in GB. If not specified, the size of the OS disk will be as defined in the VM image
# os_disk_size_gb = 48
# Size of the cluster to provision.
# A virtual machine scale set (VMSS) with this many VMs will be created.
# The minimum allowed size for this is 3 nodes for non-HA & 4 nodes for HA setup
numnodes = 8
# The size of each virtual machine. See the following link for other sizes:
# https://learn.microsoft.com/en-us/azure/virtual-machines/linux/sizes-general
vm_sku = Standard_D8s_v3
# Each VM will be provisioned with the following type of managed disk
# The azure_disk_device* parameters below specify the Linux device paths Muchos looks for when selecting disks for storage
# The default values below are for using Azure managed disks
azure_disk_device_path = /dev/disk/azure/scsi1
azure_disk_device_pattern = lun*
# If using Azure Lsv2 or Lsv3 VMs which have NVME disks for ephemeral storage, use the parameters below instead of the defaults
# azure_disk_device_path = /dev
# azure_disk_device_pattern = nvme*n1
# Type of the data disk attached to the VMSS. 'Standard_LRS' for HDD, 'Premium_LRS' for SSD, 'StandardSSD_LRS' for Standard SSD
data_disk_sku = Standard_LRS
# Number of managed disks provisioned on each VM
data_disk_count = 3
# The size of each managed disk provisioned
disk_size_gb = 128
# Indicates the host caching that should be used for data disks. Valid values are ReadOnly, ReadWrite, or None
data_disk_caching = ReadOnly
# Location to mount managed disks in each VM
mount_root = /var/data
# Location where the metrics data will be written
metrics_drive_root = var-data
# Optional proxy VM. If not set, the first node of the cluster will be selected as the proxy.
azure_proxy_host =
# Azure VM SKU to use when creating the proxy host - defaults to an 8-vCore general-purpose VM
azure_proxy_host_vm_sku = Standard_D8s_v3
# The Azure datacenter location to use for creating Muchos resources
location = westus2
# Enable ADLS Gen2 storage configuration. Muchos parameters instance_volumes_input, instance_volumes_adls & adls_storage_type are not required if use_adlsg2 is false.
use_adlsg2 = False
# Storage accounts can be auto generated or manually specified. "|" is used as separator between manual and auto generated storage account names and must be specified
# Manual and Auto generated names are mutually exclusive
#
# Specifying storage accounts manually:
# |abfss://<container-name>@<storage-account-name>.<domain-name>/<folder-name>. Use a comma to specify multiple entries
# Example: |abfss://accumulodata@shnawastore1.dfs.core.windows.net/accumulo,abfss://accumulodata@shnawastore2.dfs.core.windows.net/accumulo
#
# Specifying auto-generated storage accounts:
# <Number-of-Storage-Accounts>,<domain-name>|
# Example: 3,dfs.core.windows.net|
instance_volumes_input = 1,dfs.core.windows.net|
# Do not update "instance_volumes_adls", it will be populated dynamically during launch phase of muchos
instance_volumes_adls =
# Type of storage for ADLS Gen2 storage accounts
adls_storage_type = Standard_LRS
# Specify user assigned identity name. "{{ vmss_name }}-ua-msi" will be created if value is not provided
user_assigned_identity =
# Do not update "azure_tenant_id", it will be populated dynamically during launch phase of muchos
azure_tenant_id =
# Do not update "azure_client_id", it will be populated dynamically during launch phase of muchos
azure_client_id =
# Do not update "principal_id", it will be populated dynamically during launch phase of muchos when "use_adlsg2 = True"
principal_id =
# Optional Azure fileshare to mount on all nodes.
# Path and credentials must be updated to enable this.
#azure_fileshare_mount = /mnt/azure-fileshare
#azure_fileshare = //fileshare-to-mount.file.core.windows.net/path
#azure_fileshare_username = fs_username
#azure_fileshare_password = fs_password
# Optional integration with Azure Log Analytics
# A workspace will be created and az_logs_resource_id, az_logs_id, az_logs_key
# will be updated here by the launch command. If you wish to use an existing
# workspace in another resource group, then fill in all 3 of these values before
# running launch.
# For details on how to get a workspace ID and key, see the link below
# https://docs.microsoft.com/en-us/azure/azure-monitor/learn/quick-collect-linux-computer#obtain-workspace-id-and-key
az_oms_integration_needed = False
#az_logs_resource_id =
#az_logs_id =
#az_logs_key =
# Set to true to enable Azure Application Insights integration
# An Application Insights instance will be created by the launch command and its
# connection string will be updated here. If you wish to use a different
# Application Insights instance, then set az_appinsights_connection_string to
# its connection string before invoking launch.
az_use_app_insights = False
az_app_insights_version = 3.2.1
#az_appinsights_connection_string =
[existing]
# Root of data dirs
mount_root = /var/data
# Data directories on all nodes
data_dirs = /var/data1,/var/data2,/var/data3
# Identifies drives for metrics
metrics_drive_ids = var-data1,var-data2,var-data3
[performance]
# Automatically tune Accumulo, Yarn, and Fluo performance settings by selecting or
# creating a performance profile. Try not to use more memory than each node has
# and leave some space for the OS.
profile=perf-small
# Below are different performance profiles that can be selected. Each profile
# has the same properties with different values.
[perf-small]
# Amount of JVM heap for each tserver
accumulo_tserv_mem=2G
# Amount of data cache for each tserver. Only applies when using Accumulo 1.x
accumulo_dcache_size=768M
# Amount of index cache for each tserver. Only applies when using Accumulo 1.x
accumulo_icache_size=256M
# In memory map size for each tserver. Only applies when using Accumulo 1.x
accumulo_imap_size=512M
# Amount of JVM heap for each Fluo worker
fluo_worker_mem_mb=2048
# Determines the gap between the Yarn memory limit and the java -Xmx setting.
# For example if fluo_worker_mem_mb is set to 2048 and twill_reserve_mem_mb is
# set to 256, then for workers the java -Xmx setting will be set to 2048-256.
# If yarn is killing worker processes because they are using too much memory,
# then consider increasing this setting.
twill_reserve_mem_mb=256
# Number of threads for each Fluo worker
fluo_worker_threads=20
# Number of workers to run per node
fluo_worker_instances_multiplier=1
# Max amount of memory for YARN per node
yarn_nm_mem_mb=4096
[perf-medium]
accumulo_tserv_mem=3G
# Accumulo configs below only apply when using Accumulo 1.x
accumulo_dcache_size=1536M
accumulo_icache_size=512M
accumulo_imap_size=512M
fluo_worker_mem_mb=4096
twill_reserve_mem_mb=512
fluo_worker_threads=64
fluo_worker_instances_multiplier=1
yarn_nm_mem_mb=8192
[perf-large]
accumulo_tserv_mem=4G
# Accumulo configs below only apply when using Accumulo 1.x
accumulo_dcache_size=2G
accumulo_icache_size=1G
accumulo_imap_size=512M
fluo_worker_mem_mb=4096
twill_reserve_mem_mb=512
fluo_worker_threads=64
fluo_worker_instances_multiplier=2
yarn_nm_mem_mb=16384
[azd16s]
accumulo_tserv_mem=4G
accumulo_dcache_size=2G
accumulo_icache_size=1G
accumulo_imap_size=512M
fluo_worker_mem_mb=4096
twill_reserve_mem_mb=512
fluo_worker_threads=64
fluo_worker_instances_multiplier=2
yarn_nm_mem_mb=16384
[azd8s]
accumulo_tserv_mem=4G
accumulo_dcache_size=2G
accumulo_icache_size=1G
accumulo_imap_size=512M
fluo_worker_mem_mb=4096
twill_reserve_mem_mb=512
fluo_worker_threads=64
fluo_worker_instances_multiplier=2
yarn_nm_mem_mb=16384
[ansible-vars]
# This section is used to override Ansible variables. Any variable set below will be placed in the hosts file created by Muchos.
# Expected format: variable = value
[nodes]
# If cluster_type=existing, the list of nodes below needs to be manually populated with a list of cluster nodes in the following format:
# <Hostname> = <Service1>[,<Service2>,<Service3>]
# Where:
# Hostname = Must be unique. Will be used for hostname in EC2 or should match hostname on your own cluster
# Service = Service to run on node (possible values: zookeeper, namenode, resourcemanager, accumulomaster, client, swarmmanager,
# mesosmaster, worker, fluo, metrics, spark, elkserver). The following services are required: namenode, resourcemanager,
# accumulomaster, zookeeper & worker
# If cluster_type=azure, the list of nodes below is auto-generated by the launch action e.g. "muchos launch --cluster accumuloclstr"
# For the 'azure' cluster type, it is perfectly normal if the auto-generated list of node names is not sequential
# For viewing logging and statistics with Kibana, the elkserver service must be added to a leader node.
leader1 = namenode,resourcemanager,accumulomaster,zookeeper
leader2 = metrics
worker1 = worker,swarmmanager
worker2 = worker
worker3 = worker
worker4 = worker