-
Notifications
You must be signed in to change notification settings - Fork 9
/
deep_qcg_job.yaml
151 lines (141 loc) · 4.97 KB
/
deep_qcg_job.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# TOSCA template that submits a DEEP-OC job to an HPC batch cluster through
# the QCG gateway (see `description` below).
tosca_definitions_version: tosca_simple_yaml_1_0

imports:
  - indigo_custom_types: https://raw.githubusercontent.com/indigo-dc/tosca-types/master/custom_types.yaml

description: Submit a DEEP-OC job to HPC batch cluster (using QCG gateway)

metadata:
  display_name: DEEP HPC Job
  icon: https://github.com/indigo-dc/tosca-types/raw/master/images/qcg-logo.png
  tag: hpc

topology_template:
  inputs:
    # --- Container settings -------------------------------------------------
    docker_image:
      type: string
      description: Docker image to deploy from Docker Hub
      # Quoted so the ':' tag separator can never be mis-read by a parser.
      default: 'deephdc/deep-oc-benchmarks_cnn:cpu'
      required: false
    run_command:
      type: string
      description: Command to run inside the container
      required: false
      default: 'deepaas-cli get_metadata'
    recreate_container:
      type: boolean
      description: If true, enforce creating container, even if it already exists
      default: false
      required: false
    udocker_extra_options:
      type: string
      description: >-
        Any other possible option for udocker (e.g. "-v /tmp:/tmp" to mount
        host /tmp at /tmp inside the container)
      required: false
      default: ''

    # --- OneData settings ---------------------------------------------------
    onedata_zone:
      type: string
      description: OneData zone (e.g. https://onezone-beta.cloud.ba.infn.it, https://onezone.cloud.cnaf.infn.it)
      required: true
      default: 'https://onezone-beta.cloud.ba.infn.it'
    # Used both for the onedata_space node and for ONECLIENT_PROVIDER_HOST.
    onedata_provider:
      type: string
      description: >-
        OneData provider to use (e.g. oneprovider-beta-2.cloud.ba.infn.it;
        oprov.ifca.es, oneprovider.fedcloud.eu,
        cloud-90-147-75-163.cloud.ba.infn.it)
      required: true
      default: 'oneprovider-beta-2.cloud.ba.infn.it'
    # No default: the user has to name the space explicitly.
    onedata_space_name:
      type: string
      description: Name of OneData space
      required: true
    onedata_mount_point:
      type: string
      description: Mount point for OneData inside the container (ONEDATA_MOUNT_POINT)
      required: false
      default: '/mnt/onedata'

    # --- rclone / application settings --------------------------------------
    rclone_conf_host:
      type: string
      description: Full path to rclone.conf on the host
      required: false
      default: ''
    rclone_conf_container:
      type: string
      description: Where rclone.conf is expected inside the container (full path)
      required: false
      default: '/srv/.rclone/rclone.conf'
    app_in_out_base_dir:
      type: string
      description: Base directory inside container for input and output data (e.g. data, models)
      required: false
      default: ''
    jupyter_password:
      type: string
      description: Password for JupyterLab (at least 9 characters!)
      required: false
      default: ''

    # --- Batch job resources ------------------------------------------------
    std_outerr:
      type: string
      description: HPC job output (located in user's HOME)
      required: false
      default: 'log.txt'
    total_cores:
      type: integer
      required: false
      default: 2
    num_gpus:
      type: integer
      description: Number of required GPUs
      required: false
      default: 0
    total_nodes:
      type: integer
      required: false
      default: 1
    cores_per_node:
      type: integer
      required: false
      default: 2
    queue:
      type: string
      description: Name of the HPC partition (e.g. standard, tesla)
      required: false
      default: standard
    schedule_extra_options:
      type: list
      entry_schema:
        type: string
      description: extra options for the batch system
      required: false
      default: []
    # The wall_clock input is disabled; the matching qcg_job property below is
    # commented out as well. Re-enable both together.
    # wall_clock:
    #   type: string
    #   required: false
    #   default: "04:00:00"

  node_templates:
    # OneData space whose `token` attribute is handed to the job below.
    onedata_space:
      type: tosca.nodes.indigo.OnedataSpace
      properties:
        onezone: { get_input: onedata_zone }
        space: { get_input: onedata_space_name }
        oneproviders: [{ get_input: onedata_provider }]

    qcg_job:
      type: tosca.nodes.indigo.Qcg.Job
      properties:
        note: 'Script to submit a deep-oc application'
        # NOTE(review): the script is downloaded from the `qcg` branch and piped
        # straight into bash, so its content is not pinned to a fixed revision.
        executable: 'curl -s https://raw.githubusercontent.com/deephdc/deep-hpc/qcg/deep-slurm-qcg.sh | bash'
        std_outerr: { get_input: std_outerr }
        directory: '${HOME}'
        total_cores: { get_input: total_cores }
        total_nodes: { get_input: total_nodes }
        cores_per_node: { get_input: cores_per_node }
        gpus: { get_input: num_gpus }
        queue: { get_input: queue }
        batch_system_options: { get_input: schedule_extra_options }
        # wall_clock: { get_input: wall_clock }
        environment:
          DOCKER_IMAGE: { get_input: docker_image }
          UDOCKER_RECREATE: { get_input: recreate_container }
          UDOCKER_EXTRA_OPTIONS: { get_input: udocker_extra_options }
          ONECLIENT_ACCESS_TOKEN: { get_attribute: [onedata_space, token] }
          # Follow the onedata_provider input (the same provider registered on
          # onedata_space) rather than a fixed host, so the token and the
          # provider host always refer to the same provider.
          ONECLIENT_PROVIDER_HOST: { get_input: onedata_provider }
          ONEDATA_MOUNT_POINT: { get_input: onedata_mount_point }
          RCLONE_CONF_HOST: { get_input: rclone_conf_host }
          RCLONE_CONF_CONTAINER: { get_input: rclone_conf_container }
          APP_IN_OUT_BASE_DIR: { get_input: app_in_out_base_dir }
          UDOCKER_RUN_COMMAND: { get_input: run_command }
          NUM_GPUS: { get_input: num_gpus }
          jupyterPASSWORD: { get_input: jupyter_password }